updating the documentation

This commit is contained in:
Alejandro Moreo Fernandez 2021-12-06 18:25:47 +01:00
parent 1f591ec105
commit 2bd47f0841
9 changed files with 1197 additions and 218 deletions

View File

@ -255,12 +255,8 @@
<li><a href="quapy.method.html#quapy.method.neural.QuaNetModule.device">(quapy.method.neural.QuaNetModule property)</a> <li><a href="quapy.method.html#quapy.method.neural.QuaNetModule.device">(quapy.method.neural.QuaNetModule property)</a>
</li> </li>
</ul></li> </ul></li>
<li><a href="quapy.data.html#quapy.data.datasets.df_replace">df_replace() (in module quapy.data.datasets)</a>
</li>
<li><a href="quapy.classification.html#quapy.classification.neural.TextClassifierNet.dimensions">dimensions() (quapy.classification.neural.TextClassifierNet method)</a> <li><a href="quapy.classification.html#quapy.classification.neural.TextClassifierNet.dimensions">dimensions() (quapy.classification.neural.TextClassifierNet method)</a>
</li> </li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="quapy.classification.html#quapy.classification.neural.CNNnet.document_embedding">document_embedding() (quapy.classification.neural.CNNnet method)</a> <li><a href="quapy.classification.html#quapy.classification.neural.CNNnet.document_embedding">document_embedding() (quapy.classification.neural.CNNnet method)</a>
<ul> <ul>
@ -269,6 +265,8 @@
<li><a href="quapy.classification.html#quapy.classification.neural.TextClassifierNet.document_embedding">(quapy.classification.neural.TextClassifierNet method)</a> <li><a href="quapy.classification.html#quapy.classification.neural.TextClassifierNet.document_embedding">(quapy.classification.neural.TextClassifierNet method)</a>
</li> </li>
</ul></li> </ul></li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="quapy.html#quapy.util.download_file">download_file() (in module quapy.util)</a> <li><a href="quapy.html#quapy.util.download_file">download_file() (in module quapy.util)</a>
</li> </li>
<li><a href="quapy.html#quapy.util.download_file_if_not_exists">download_file_if_not_exists() (in module quapy.util)</a> <li><a href="quapy.html#quapy.util.download_file_if_not_exists">download_file_if_not_exists() (in module quapy.util)</a>
@ -462,19 +460,15 @@
<table style="width: 100%" class="indextable genindextable"><tr> <table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul> <td style="width: 33%; vertical-align: top;"><ul>
<li><a href="quapy.data.html#quapy.data.preprocessing.index">index() (in module quapy.data.preprocessing)</a> <li><a href="quapy.data.html#quapy.data.preprocessing.index">index() (in module quapy.data.preprocessing)</a>
<ul>
<li><a href="quapy.data.html#quapy.data.preprocessing.IndexTransformer.index">(quapy.data.preprocessing.IndexTransformer method)</a>
</li> </li>
</ul></li>
<li><a href="quapy.data.html#quapy.data.preprocessing.IndexTransformer">IndexTransformer (class in quapy.data.preprocessing)</a> <li><a href="quapy.data.html#quapy.data.preprocessing.IndexTransformer">IndexTransformer (class in quapy.data.preprocessing)</a>
</li> </li>
<li><a href="quapy.method.html#quapy.method.neural.QuaNetModule.init_hidden">init_hidden() (quapy.method.neural.QuaNetModule method)</a> <li><a href="quapy.method.html#quapy.method.neural.QuaNetModule.init_hidden">init_hidden() (quapy.method.neural.QuaNetModule method)</a>
</li> </li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="quapy.method.html#quapy.method.base.isaggregative">isaggregative() (in module quapy.method.base)</a> <li><a href="quapy.method.html#quapy.method.base.isaggregative">isaggregative() (in module quapy.method.base)</a>
</li> </li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="quapy.html#quapy.isbinary">isbinary() (in module quapy)</a> <li><a href="quapy.html#quapy.isbinary">isbinary() (in module quapy)</a>
<ul> <ul>

Binary file not shown.

View File

@ -63,45 +63,144 @@
<dt class="sig sig-object py" id="quapy.data.base.Dataset"> <dt class="sig sig-object py" id="quapy.data.base.Dataset">
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">Dataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">training</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">vocabulary</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">Dataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">training</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">vocabulary</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p> <dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Abstraction of training and test <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> objects.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>training</strong> a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance</p></li>
<li><p><strong>test</strong> a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance</p></li>
<li><p><strong>vocabulary</strong> if indicated, is a dictionary of the terms used in this textual dataset</p></li>
<li><p><strong>name</strong> a string representing the name of the dataset</p></li>
</ul>
</dd>
</dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.SplitStratified"> <dt class="sig sig-object py" id="quapy.data.base.Dataset.SplitStratified">
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">SplitStratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">collection</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">train_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.SplitStratified" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">SplitStratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">collection</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">train_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.SplitStratified" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Generates a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> from a stratified split of a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance.
See <a class="reference internal" href="#quapy.data.base.LabelledCollection.split_stratified" title="quapy.data.base.LabelledCollection.split_stratified"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.split_stratified()</span></code></a></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>collection</strong> <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p></li>
<li><p><strong>train_size</strong> the proportion of training documents (the rest conforms the test split)</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py property"> <dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.binary"> <dt class="sig sig-object py" id="quapy.data.base.Dataset.binary">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.Dataset.binary" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.Dataset.binary" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Returns True if the training collection is labelled according to two classes</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>boolean</p>
</dd>
</dl>
</dd></dl>
<dl class="py property"> <dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.classes_"> <dt class="sig sig-object py" id="quapy.data.base.Dataset.classes_">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">classes_</span></span><a class="headerlink" href="#quapy.data.base.Dataset.classes_" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">classes_</span></span><a class="headerlink" href="#quapy.data.base.Dataset.classes_" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>The classes according to which the training collection is labelled</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>The classes according to which the training collection is labelled</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.kFCV"> <dt class="sig sig-object py" id="quapy.data.base.Dataset.kFCV">
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.kFCV" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.kFCV" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around
<a class="reference internal" href="#quapy.data.base.LabelledCollection.kFCV" title="quapy.data.base.LabelledCollection.kFCV"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.kFCV()</span></code></a> that returns <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> instances made of training and test folds.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>nfolds</strong> integer (default 5), the number of folds to generate</p></li>
<li><p><strong>nrepeats</strong> integer (default 1), the number of rounds of k-fold cross validation to run</p></li>
<li><p><strong>random_state</strong> integer (default 0), guarantees that the folds generated are reproducible</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>yields <cite>nfolds * nrepeats</cite> folds for k-fold cross validation as instances of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.load"> <dt class="sig sig-object py" id="quapy.data.base.Dataset.load">
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.load" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">loader_kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.load" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Loads a training and a test labelled set of data and convert it into a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> instance.
The function in charge of reading the instances must be specified. This function can be a custom one, or any of
the reading functions defined in <a class="reference internal" href="#module-quapy.data.reader" title="quapy.data.reader"><code class="xref py py-mod docutils literal notranslate"><span class="pre">quapy.data.reader</span></code></a> module.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>train_path</strong> string, the path to the file containing the training instances</p></li>
<li><p><strong>test_path</strong> string, the path to the file containing the test instances</p></li>
<li><p><strong>loader_func</strong> a custom function that implements the data loader and returns a tuple with instances and
labels</p></li>
<li><p><strong>classes</strong> array-like, the classes according to which the instances are labelled</p></li>
<li><p><strong>loader_kwargs</strong> any argument that the <cite>loader_func</cite> function needs in order to read the instances.
See <a class="reference internal" href="#quapy.data.base.LabelledCollection.load" title="quapy.data.base.LabelledCollection.load"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.load()</span></code></a> for further details.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object</p>
</dd>
</dl>
</dd></dl>
<dl class="py property"> <dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.n_classes"> <dt class="sig sig-object py" id="quapy.data.base.Dataset.n_classes">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.Dataset.n_classes" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.Dataset.n_classes" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>The number of classes according to which the training collection is labelled</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>integer</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.stats"> <dt class="sig sig-object py" id="quapy.data.base.Dataset.stats">
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.stats" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.stats" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">&#39;kindle&#39;</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Dataset</span><span class="o">=</span><span class="n">kindle</span> <span class="c1">#tr-instances=3821, #te-instances=21591, type=&lt;class &#39;scipy.sparse.csr.csr_matrix&#39;&gt;, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>show</strong> if set to True (default), prints the stats in standard output</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a dictionary containing some stats of this collection for the training and test collections. The keys
are <cite>train</cite> and <cite>test</cite>, and point to dedicated dictionaries of stats, for each collection, with keys
<cite>#instances</cite> (the number of instances), <cite>type</cite> (the type representing the instances),
<cite>#features</cite> (the number of features, if the instances are in array-like format), <cite>#classes</cite> (the classes of
the collection), <cite>prevs</cite> (the prevalence values for each class)</p>
</dd>
</dl>
</dd></dl>
<dl class="py property"> <dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.vocabulary_size"> <dt class="sig sig-object py" id="quapy.data.base.Dataset.vocabulary_size">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">vocabulary_size</span></span><a class="headerlink" href="#quapy.data.base.Dataset.vocabulary_size" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">vocabulary_size</span></span><a class="headerlink" href="#quapy.data.base.Dataset.vocabulary_size" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>integer</p>
</dd>
</dl>
</dd></dl>
</dd></dl> </dd></dl>
@ -109,161 +208,480 @@
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection">
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">LabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">instances</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes_</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">LabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">instances</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes_</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p> <dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>A LabelledCollection is a set of objects each with a label associated to it.</p> <p>A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling
routines.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>instances</strong> array-like (np.ndarray, list, or csr_matrix are supported)</p></li>
<li><p><strong>labels</strong> array-like with the same length of instances</p></li>
<li><p><strong>classes</strong> optional, list of classes from which labels are taken. If not specified, the classes are inferred
from the labels. The classes must be indicated in cases in which some of the labels might have no examples
(i.e., a prevalence of 0)</p></li>
</ul>
</dd>
</dl>
<dl class="py property"> <dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.Xy"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.Xy">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">Xy</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.Xy" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">Xy</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.Xy" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Gets the instances and labels. This is useful when working with <cite>sklearn</cite> estimators, e.g.:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span> <span class="o">=</span> <span class="n">LinearSVC</span><span class="p">()</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="o">*</span><span class="n">my_collection</span><span class="o">.</span><span class="n">Xy</span><span class="p">)</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>a tuple <cite>(instances, labels)</cite> from this collection</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.artificial_sampling_generator"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.artificial_sampling_generator">
<span class="sig-name descname"><span class="pre">artificial_sampling_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">101</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.artificial_sampling_generator" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">artificial_sampling_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">101</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.artificial_sampling_generator" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>A generator of samples that implements the artificial prevalence protocol (APP). The APP consists of exploring
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, …, 1]), and generating all valid combinations of
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], …,
[1, 0, 0] prevalence values of size <cite>sample_size</cite> will be yielded). The number of samples for each valid
combination of prevalence values is indicated by <cite>repeats</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>sample_size</strong> the number of instances in each sample</p></li>
<li><p><strong>n_prevalences</strong> the number of prevalence points to be taken from the [0,1] interval (including the
limits {0,1}). E.g., if <cite>n_prevalences=11</cite>, then the prevalence points to take are [0, 0.1, 0.2, …, 1]</p></li>
<li><p><strong>repeats</strong> the number of samples to generate for each valid combination of prevalence values (default 1)</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>yield samples generated at artificially controlled prevalence values</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.artificial_sampling_index_generator"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.artificial_sampling_index_generator">
<span class="sig-name descname"><span class="pre">artificial_sampling_index_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">101</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.artificial_sampling_index_generator" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">artificial_sampling_index_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">101</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.artificial_sampling_index_generator" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>A generator of sample indexes implementing the artificial prevalence protocol (APP).
The APP consists of exploring
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, …, 1]), and generating all valid combinations of
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], …,
[1, 0, 0] prevalence values of size <cite>sample_size</cite> will be yielded). The number of sample indexes for each valid
combination of prevalence values is indicated by <cite>repeats</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>sample_size</strong> the number of instances in each sample (i.e., length of each index)</p></li>
<li><p><strong>n_prevalences</strong> the number of prevalence points to be taken from the [0,1] interval (including the
limits {0,1}). E.g., if <cite>n_prevalences=11</cite>, then the prevalence points to take are [0, 0.1, 0.2, …, 1]</p></li>
<li><p><strong>repeats</strong> the number of samples to generate for each valid combination of prevalence values (default 1)</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>yield the indexes that generate the samples according to APP</p>
</dd>
</dl>
</dd></dl>
<dl class="py property"> <dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.binary"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.binary">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.binary" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.binary" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Returns True if the number of classes is 2</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>boolean</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.counts"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.counts">
<span class="sig-name descname"><span class="pre">counts</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.counts" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">counts</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.counts" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Returns the number of instances for each of the classes of interest.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>a np.ndarray of shape <cite>(n_classes)</cite> with the number of instances of each class, in the same order
as listed by <cite>self.classes_</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.kFCV"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.kFCV">
<span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.kFCV" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.kFCV" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Generator of stratified folds to be used in k-fold cross validation.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>nfolds</strong> integer (default 5), the number of folds to generate</p></li>
<li><p><strong>nrepeats</strong> integer (default 1), the number of rounds of k-fold cross validation to run</p></li>
<li><p><strong>random_state</strong> integer (default 0), guarantees that the folds generated are reproducible</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>yields <cite>nfolds * nrepeats</cite> folds for k-fold cross validation</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.load"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.load">
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.load" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">loader_kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.load" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Loads a labelled set of data and convert it into a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance. The function in charge
of reading the instances must be specified. This function can be a custom one, or any of the reading functions
defined in <a class="reference internal" href="#module-quapy.data.reader" title="quapy.data.reader"><code class="xref py py-mod docutils literal notranslate"><span class="pre">quapy.data.reader</span></code></a> module.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>path</strong> string, the path to the file containing the labelled instances</p></li>
<li><p><strong>loader_func</strong> a custom function that implements the data loader and returns a tuple with instances and
labels</p></li>
<li><p><strong>classes</strong> array-like, the classes according to which the instances are labelled</p></li>
<li><p><strong>loader_kwargs</strong> any argument that the <cite>loader_func</cite> function needs in order to read the instances, i.e.,
these arguments are used to call <cite>loader_func(path, **loader_kwargs)</cite></p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> object</p>
</dd>
</dl>
</dd></dl>
<dl class="py property"> <dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.n_classes"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.n_classes">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.n_classes" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.n_classes" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>The number of classes</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>integer</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.natural_sampling_generator"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.natural_sampling_generator">
<span class="sig-name descname"><span class="pre">natural_sampling_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.natural_sampling_generator" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">natural_sampling_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.natural_sampling_generator" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>sample_size</strong> integer, the number of instances in each sample</p></li>
<li><p><strong>repeats</strong> the number of samples to generate</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>yield instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.natural_sampling_index_generator"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.natural_sampling_index_generator">
<span class="sig-name descname"><span class="pre">natural_sampling_index_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.natural_sampling_index_generator" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">natural_sampling_index_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.natural_sampling_index_generator" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>A generator of sample indexes according to the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>sample_size</strong> integer, the number of instances in each sample (i.e., the length of each index)</p></li>
<li><p><strong>repeats</strong> the number of indexes to generate</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>yield <cite>repeats</cite> instances of np.ndarray with shape <cite>(sample_size,)</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.prevalence"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.prevalence">
<span class="sig-name descname"><span class="pre">prevalence</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.prevalence" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">prevalence</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.prevalence" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Returns the prevalence, or relative frequency, of the classes of interest.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>a np.ndarray of shape <cite>(n_classes)</cite> with the relative frequencies of each class, in the same order
as listed by <cite>self.classes_</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling">
<span class="sig-name descname"><span class="pre">sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Return a random sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size and desired prevalence
values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
the actual prevalence of the class, or with replacement otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>size</strong> integer, the requested size</p></li>
<li><p><strong>prevs</strong> the prevalence for each class; the prevalence value for the last class can be lead empty since
it is constrained. E.g., for binary collections, only the prevalence <cite>p</cite> for the first class (as listed in
<cite>self.classes_</cite> can be specified, while the other class takes prevalence value <cite>1-p</cite></p></li>
<li><p><strong>shuffle</strong> if set to True (default), shuffles the index before returning it</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> with length == <cite>size</cite> and prevalence close to <cite>prevs</cite> (or
prevalence == <cite>prevs</cite> if the exact prevalence values can be met as proportions of instances)</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_from_index"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_from_index">
<span class="sig-name descname"><span class="pre">sampling_from_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">index</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_from_index" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">sampling_from_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">index</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_from_index" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Returns an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> whose elements are sampled from this collection using the
index.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>index</strong> np.ndarray</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_index"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_index">
<span class="sig-name descname"><span class="pre">sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_index" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_index" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
prevalence values are not specified, then returns the index of a uniform sampling.
For each class, the sampling is drawn without replacement if the requested prevalence is larger than
the actual prevalence of the class, or with replacement otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>size</strong> integer, the requested size</p></li>
<li><p><strong>prevs</strong> the prevalence for each class; the prevalence value for the last class can be lead empty since
it is constrained. E.g., for binary collections, only the prevalence <cite>p</cite> for the first class (as listed in
<cite>self.classes_</cite> can be specified, while the other class takes prevalence value <cite>1-p</cite></p></li>
<li><p><strong>shuffle</strong> if set to True (default), shuffles the index before returning it</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a np.ndarray of shape <cite>(size)</cite> with the indexes</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.split_stratified"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.split_stratified">
<span class="sig-name descname"><span class="pre">split_stratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_prop</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.split_stratified" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">split_stratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_prop</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.split_stratified" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Returns two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> split with stratification from this collection, at desired
proportion.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>train_prop</strong> the proportion of elements to include in the left-most returned collection (typically used
as the training collection). The rest of elements are included in the right-most returned collection
(typically used as a test collection).</p></li>
<li><p><strong>random_state</strong> if specified, guarantees reproducibility of the split.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>, the first one with <cite>train_prop</cite> elements, and the
second one with <cite>1-train_prop</cite> elements</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.stats"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.stats">
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.stats" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.stats" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">&#39;kindle&#39;</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span><span class="o">.</span><span class="n">training</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1">#instances=3821, type=&lt;class &#39;scipy.sparse.csr.csr_matrix&#39;&gt;, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>show</strong> if set to True (default), prints the stats in standard output</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a dictionary containing some stats of this collection. Keys include <cite>#instances</cite> (the number of
instances), <cite>type</cite> (the type representing the instances), <cite>#features</cite> (the number of features, if the
instances are in array-like format), <cite>#classes</cite> (the classes of the collection), <cite>prevs</cite> (the prevalence
values for each class)</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling">
<span class="sig-name descname"><span class="pre">uniform_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">uniform_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Returns a uniform sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size. The sampling is drawn
without replacement if the requested size is greater than the number of instances, or with replacement
otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>size</strong> integer, the requested size</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> with length == <cite>size</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling_index"> <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling_index">
<span class="sig-name descname"><span class="pre">uniform_sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling_index" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">uniform_sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling_index" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
without replacement if the requested size is greater than the number of instances, or with replacement
otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>size</strong> integer, the size of the uniform sample</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a np.ndarray of shape <cite>(size)</cite> with the indexes</p>
</dd>
</dl>
</dd></dl>
</dd></dl> </dd></dl>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.data.base.isbinary"> <dt class="sig sig-object py" id="quapy.data.base.isbinary">
<span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">isbinary</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.isbinary" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">isbinary</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.isbinary" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Returns True if <cite>data</cite> is either a binary <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> or a binary <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>data</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> or a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> object</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>True if labelled according to two classes</p>
</dd>
</dl>
</dd></dl>
</section> </section>
<section id="module-quapy.data.datasets"> <section id="module-quapy.data.datasets">
<span id="quapy-data-datasets-module"></span><h2>quapy.data.datasets module<a class="headerlink" href="#module-quapy.data.datasets" title="Permalink to this headline"></a></h2> <span id="quapy-data-datasets-module"></span><h2>quapy.data.datasets module<a class="headerlink" href="#module-quapy.data.datasets" title="Permalink to this headline"></a></h2>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.df_replace">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">df_replace</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="pre">df</span></em>, <em class="sig-param"><span class="pre">col</span></em>, <em class="sig-param"><span class="pre">repl={'no':</span> <span class="pre">0</span></em>, <em class="sig-param"><span class="pre">'yes':</span> <span class="pre">1}</span></em>, <em class="sig-param"><span class="pre">astype=&lt;class</span> <span class="pre">'float'&gt;</span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.datasets.df_replace" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCIDataset"> <dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCIDataset">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCIDataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_split</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_UCIDataset" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCIDataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_split</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_UCIDataset" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Loads a UCI dataset as an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>, as used in
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253516300628">Pérez-Gállego, P., Quevedo, J. R., &amp; del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100.</a>
and
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253517303652">Pérez-Gállego, P., Castano, A., Quevedo, J. R., &amp; del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15.</a>.
The datasets do not come with a predefined train-test split (see <a class="reference internal" href="#quapy.data.datasets.fetch_UCILabelledCollection" title="quapy.data.datasets.fetch_UCILabelledCollection"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fetch_UCILabelledCollection()</span></code></a> for further
information on how to use these collections), and so a train-test split is generated at desired proportion.
The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_DATASETS</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> a dataset name</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>test_split</strong> proportion of documents to be included in the test set. The rest conforms the training set</p></li>
<li><p><strong>verbose</strong> set to True (default is False) to get information (from the UCI ML repository) about the datasets</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
</dd>
</dl>
</dd></dl>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCILabelledCollection"> <dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCILabelledCollection">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCILabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_UCILabelledCollection" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCILabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_UCILabelledCollection" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Loads a UCI collection as an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>, as used in
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253516300628">Pérez-Gállego, P., Quevedo, J. R., &amp; del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100.</a>
and
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253517303652">Pérez-Gállego, P., Castano, A., Quevedo, J. R., &amp; del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15.</a>.
The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
This can be reproduced by using <a class="reference internal" href="#quapy.data.base.Dataset.kFCV" title="quapy.data.base.Dataset.kFCV"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.data.base.Dataset.kFCV()</span></code></a>, e.g.:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">collection</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_UCILabelledCollection</span><span class="p">(</span><span class="s2">&quot;yeast&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">for</span> <span class="n">data</span> <span class="ow">in</span> <span class="n">qp</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">Dataset</span><span class="o">.</span><span class="n">kFCV</span><span class="p">(</span><span class="n">collection</span><span class="p">,</span> <span class="n">nfolds</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">nrepeats</span><span class="o">=</span><span class="mi">2</span><span class="p">):</span>
<span class="gp">&gt;&gt;&gt; </span> <span class="o">...</span>
</pre></div>
</div>
<p>The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_DATASETS</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> a dataset name</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>test_split</strong> proportion of documents to be included in the test set. The rest conforms the training set</p></li>
<li><p><strong>verbose</strong> set to True (default is False) to get information (from the UCI ML repository) about the datasets</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
</dd>
</dl>
</dd></dl>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_reviews"> <dt class="sig sig-object py" id="quapy.data.datasets.fetch_reviews">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_reviews</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tfidf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_reviews" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_reviews</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tfidf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_reviews" title="Permalink to this definition"></a></dt>
<dd><p>Load a Reviews dataset as a Dataset instance, as used in: <dd><p>Loads a Reviews dataset as a Dataset instance, as used in
Esuli, A., Moreo, A., and Sebastiani, F. “A recurrent neural network for sentiment quantification.” <a class="reference external" href="https://dl.acm.org/doi/abs/10.1145/3269206.3269287">Esuli, A., Moreo, A., and Sebastiani, F. “A recurrent neural network for sentiment quantification.”
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.</a>.
:param dataset_name: the name of the dataset: valid ones are hp, kindle, imdb The list of valid dataset names can be accessed in <cite>quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS</cite></p>
:param tfidf: set to True to transform the raw documents into tfidf weighted matrices <dl class="field-list simple">
:param min_df: minimun number of documents that should contain a term in order for the term to be <dt class="field-odd">Parameters</dt>
kept (ignored if tfidf==False) <dd class="field-odd"><ul class="simple">
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default <li><p><strong>dataset_name</strong> the name of the dataset: valid ones are hp, kindle, imdb</p></li>
~/quay_data/ directory) <li><p><strong>tfidf</strong> set to True to transform the raw documents into tfidf weighted matrices</p></li>
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for <li><p><strong>min_df</strong> minimun number of documents that should contain a term in order for the term to be
faster subsequent invokations kept (ignored if tfidf==False)</p></li>
:return: a Dataset instance</p> <li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>pickle</strong> set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
</dd>
</dl>
</dd></dl> </dd></dl>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_twitter"> <dt class="sig sig-object py" id="quapy.data.datasets.fetch_twitter">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_twitter</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">for_model_selection</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_twitter" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_twitter</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">for_model_selection</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_twitter" title="Permalink to this definition"></a></dt>
<dd><p>Load a Twitter dataset as a Dataset instance, as used in: <dd><p>Loads a Twitter dataset as a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance, as used in:
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis. <a class="reference external" href="https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf">Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining6(19), 122 (2016) Social Network Analysis and Mining6(19), 122 (2016)</a>
The datasets semeval13, semeval14, semeval15 share the same training set.</p> Note that the datasets semeval13, semeval14, semeval15 share the same training set.
The list of valid dataset names corresponding to training sets can be accessed in
<cite>quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN</cite>, while the test sets can be accessed in
<cite>quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST</cite></p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>dataset_name</strong> the name of the dataset: valid ones are gasp, hcr, omd, sanders, semeval13,</p> <dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> the name of the dataset: valid ones are gasp, hcr, omd, sanders, semeval13,
semeval14, semeval15, semeval16, sst, wa, wb</p></li>
<li><p><strong>for_model_selection</strong> if True, then returns the train split as the training set and the devel split
as the test set; if False, then returns the train+devel split as the training set and the test set as the
test set</p></li>
<li><p><strong>min_df</strong> minimun number of documents that should contain a term in order for the term to be kept</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>pickle</strong> set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
</dd> </dd>
</dl> </dl>
<p>semeval14, semeval15, semeval16, sst, wa, wb
:param for_model_selection: if True, then returns the train split as the training set and the devel split
as the test set; if False, then returns the train+devel split as the training set and the test set as the
test set
:param min_df: minimun number of documents that should contain a term in order for the term to be kept
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations
:return: a Dataset instance</p>
</dd></dl> </dd></dl>
<dl class="py function"> <dl class="py function">
@ -278,15 +696,41 @@ faster subsequent invokations
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer"> <dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer">
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">IndexTransformer</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer" title="Permalink to this definition"></a></dt> <em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">IndexTransformer</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p> <dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>This class implements a sklearns-style transformer that indexes text as numerical ids for the tokens it
contains, and that would be generated by sklearns
<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>kwargs</strong> <p>keyworded arguments from <a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a></p>
</p>
</dd>
</dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.add_word"> <dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.add_word">
<span class="sig-name descname"><span class="pre">add_word</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">word</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nogaps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.add_word" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">add_word</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">word</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nogaps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.add_word" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Adds a new token (regardless of whether it has been found in the text or not), with dedicated id.
Useful to define special tokens for codifying unknown words, or padding tokens.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>word</strong> string, surface form of the token</p></li>
<li><p><strong>id</strong> integer, numerical value to assign to the token (leave as None for indicating the next valid id,
default)</p></li>
<li><p><strong>nogaps</strong> if set to True (default) asserts that the id indicated leads to no numerical gaps with
precedent ids stored so far</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>integer, the numerical id for the new token</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit"> <dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit">
<span class="sig-name descname"><span class="pre">fit</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">fit</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit" title="Permalink to this definition"></a></dt>
<dd><dl class="field-list simple"> <dd><p>Fits the transformer, i.e., decides on the vocabulary, given a list of strings.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>X</strong> a list of strings</p> <dd class="field-odd"><p><strong>X</strong> a list of strings</p>
</dd> </dd>
@ -299,66 +743,139 @@ faster subsequent invokations
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit_transform"> <dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit_transform">
<span class="sig-name descname"><span class="pre">fit_transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-</span> <span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit_transform" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">fit_transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-</span> <span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit_transform" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Fits the transform on <cite>X</cite> and transforms it.</p>
<dl class="field-list simple">
<dl class="py method"> <dt class="field-odd">Parameters</dt>
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.index"> <dd class="field-odd"><ul class="simple">
<span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">documents</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.index" title="Permalink to this definition"></a></dt> <li><p><strong>X</strong> a list of strings</p></li>
<dd></dd></dl> <li><p><strong>n_jobs</strong> the number of parallel workers to carry out this task</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <cite>np.ndarray</cite> of numerical ids</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.transform"> <dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.transform">
<span class="sig-name descname"><span class="pre">transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-</span> <span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.transform" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-</span> <span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.transform" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Transforms the strings in <cite>X</cite> as lists of numerical ids</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>X</strong> a list of strings</p></li>
<li><p><strong>n_jobs</strong> the number of parallel workers to carry out this task</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <cite>np.ndarray</cite> of numerical ids</p>
</dd>
</dl>
</dd></dl>
<dl class="py method"> <dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.vocabulary_size"> <dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.vocabulary_size">
<span class="sig-name descname"><span class="pre">vocabulary_size</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.vocabulary_size" title="Permalink to this definition"></a></dt> <span class="sig-name descname"><span class="pre">vocabulary_size</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.vocabulary_size" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Gets the length of the vocabulary according to which the document tokens have been indexed</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>integer</p>
</dd>
</dl>
</dd></dl>
</dd></dl> </dd></dl>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.data.preprocessing.index"> <dt class="sig sig-object py" id="quapy.data.preprocessing.index">
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.index" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.index" title="Permalink to this definition"></a></dt>
<dd><p>Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index. <dd><p>Indexes the tokens of a textual <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of string documents.
Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK To index a document means to replace each different token by a unique numerical index.
:param dataset: a Dataset where the instances are lists of str Rare words (i.e., words occurring less than <cite>min_df</cite> times) are replaced by a special token <cite>UNK</cite></p>
:param min_df: minimum number of instances below which the term is replaced by a UNK index <dl class="field-list simple">
:param inplace: whether or not to apply the transformation inplace, or to a new copy <dt class="field-odd">Parameters</dt>
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer) <dd class="field-odd"><ul class="simple">
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True) <li><p><strong>dataset</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> object where the instances of training and test documents
consisting of lists of integer values representing indices.</p> are lists of str</p></li>
<li><p><strong>min_df</strong> minimum number of occurrences below which the term is replaced by a <cite>UNK</cite> index</p></li>
<li><p><strong>inplace</strong> whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
<li><p><strong>kwargs</strong> the rest of parameters of the transformation (as for sklearns</p></li>
</ul>
</dd>
</dl>
<p><cite>CountVectorizer &lt;https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html&gt;_</cite>)
:return: a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (if inplace=False) or a reference to the current</p>
<blockquote>
<div><p><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (inplace=True) consisting of lists of integer values representing indices.</p>
</div></blockquote>
</dd></dl> </dd></dl>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.data.preprocessing.reduce_columns"> <dt class="sig sig-object py" id="quapy.data.preprocessing.reduce_columns">
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">reduce_columns</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.reduce_columns" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">reduce_columns</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.reduce_columns" title="Permalink to this definition"></a></dt>
<dd><p>Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least <dd><p>Reduces the dimensionality of the instances, represented as a <cite>csr_matrix</cite> (or any subtype of
_min_df_ instances <cite>scipy.sparse.spmatrix</cite>), of training and test documents by removing the columns of words which are not present
:param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix) in at least <cite>min_df</cite> instances in the training set</p>
:param min_df: minimum number of instances below which the columns are removed <dl class="field-list simple">
:param inplace: whether or not to apply the transformation inplace, or to a new copy <dt class="field-odd">Parameters</dt>
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True) <dd class="field-odd"><ul class="simple">
where the dimensions corresponding to infrequent instances have been removed</p> <li><p><strong>dataset</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> in which instances are represented in sparse format (any
subtype of scipy.sparse.spmatrix)</p></li>
<li><p><strong>min_df</strong> integer, minimum number of instances below which the columns are removed</p></li>
<li><p><strong>inplace</strong> whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (if inplace=False) or a reference to the current
<a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (inplace=True) where the dimensions corresponding to infrequent terms
in the training set have been removed</p>
</dd>
</dl>
</dd></dl> </dd></dl>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.data.preprocessing.standardize"> <dt class="sig sig-object py" id="quapy.data.preprocessing.standardize">
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">standardize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.standardize" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">standardize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.standardize" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Standardizes the real-valued columns of a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>.
Standardization, aka z-scoring, of a variable <cite>X</cite> comes down to subtracting the average and normalizing by the
standard deviation.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> object</p></li>
<li><p><strong>inplace</strong> set to True if the transformation is to be applied inplace, or to False (default) if a new
<a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> is to be returned</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p></p>
</dd>
</dl>
</dd></dl>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.data.preprocessing.text2tfidf"> <dt class="sig sig-object py" id="quapy.data.preprocessing.text2tfidf">
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">text2tfidf</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sublinear_tf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.text2tfidf" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">text2tfidf</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sublinear_tf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.text2tfidf" title="Permalink to this definition"></a></dt>
<dd><p>Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors <dd><p>Transforms a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of textual instances into a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of
:param dataset: a Dataset where the instances are lists of str tfidf weighted sparse vectors</p>
:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary <dl class="field-list simple">
:param sublinear_tf: whether or not to apply the log scalling to the tf counters <dt class="field-odd">Parameters</dt>
:param inplace: whether or not to apply the transformation inplace, or to a new copy <dd class="field-odd"><ul class="simple">
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer) <li><p><strong>dataset</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> where the instances of training and test collections are
:return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True) lists of str</p></li>
where the instances are stored in a csr_matrix of real-valued tfidf scores</p> <li><p><strong>min_df</strong> minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)</p></li>
<li><p><strong>sublinear_tf</strong> whether or not to apply the log scalling to the tf counters (default True)</p></li>
<li><p><strong>inplace</strong> whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
<li><p><strong>kwargs</strong> the rest of parameters of the transformation (as for sklearns
<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html">TfidfVectorizer</a>)</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> in <cite>csr_matrix</cite> format (if inplace=False) or a reference to the
current Dataset (if inplace=True) where the instances are stored in a <cite>csr_matrix</cite> of real-valued tfidf scores</p>
</dd>
</dl>
</dd></dl> </dd></dl>
</section> </section>
@ -367,7 +884,24 @@ where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.data.reader.binarize"> <dt class="sig sig-object py" id="quapy.data.reader.binarize">
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">binarize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pos_class</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.reader.binarize" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">binarize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pos_class</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.reader.binarize" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Binarizes a categorical array-like collection of labels towards the positive class <cite>pos_class</cite>. E.g.,:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">binarize</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">array</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">])</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>y</strong> array-like of labels</p></li>
<li><p><strong>pos_class</strong> integer, the positive class</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a binary np.ndarray, in which values 1 corresponds to positions in whcih <cite>y</cite> had <cite>pos_class</cite> labels, and
0 otherwise</p>
</dd>
</dl>
</dd></dl>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.data.reader.from_csv"> <dt class="sig sig-object py" id="quapy.data.reader.from_csv">
@ -376,10 +910,13 @@ where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
File format &lt;label&gt;,&lt;feat1&gt;,&lt;feat2&gt;,…,&lt;featn&gt;</p> File format &lt;label&gt;,&lt;feat1&gt;,&lt;feat2&gt;,…,&lt;featn&gt;</p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>path</strong> path to the csv file</p> <dd class="field-odd"><ul class="simple">
<li><p><strong>path</strong> path to the csv file</p></li>
<li><p><strong>encoding</strong> the text encoding used to open the file</p></li>
</ul>
</dd> </dd>
<dt class="field-even">Returns</dt> <dt class="field-even">Returns</dt>
<dd class="field-even"><p>a ndarray for the labels and a ndarray (float) for the covariates</p> <dd class="field-even"><p>a np.ndarray for the labels and a ndarray (float) for the covariates</p>
</dd> </dd>
</dl> </dl>
</dd></dl> </dd></dl>
@ -394,7 +931,7 @@ File format &lt;-1 or 0 or 1&gt;[s col(int):val(float)]</p>
<dd class="field-odd"><p><strong>path</strong> path to the labelled collection</p> <dd class="field-odd"><p><strong>path</strong> path to the labelled collection</p>
</dd> </dd>
<dt class="field-even">Returns</dt> <dt class="field-even">Returns</dt>
<dd class="field-even"><p>a csr_matrix containing the instances (rows), and a ndarray containing the labels</p> <dd class="field-even"><p>a <cite>csr_matrix</cite> containing the instances (rows), and a ndarray containing the labels</p>
</dd> </dd>
</dl> </dl>
</dd></dl> </dd></dl>
@ -406,7 +943,11 @@ File format &lt;-1 or 0 or 1&gt;[s col(int):val(float)]</p>
File fomart &lt;0 or 1&gt; &lt;document&gt;</p> File fomart &lt;0 or 1&gt; &lt;document&gt;</p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>path</strong> path to the labelled collection</p> <dd class="field-odd"><ul class="simple">
<li><p><strong>path</strong> path to the labelled collection</p></li>
<li><p><strong>encoding</strong> the text encoding used to open the file</p></li>
<li><p><strong>verbose</strong> if &gt;0 (default) shows some progress information in standard output</p></li>
</ul>
</dd> </dd>
<dt class="field-even">Returns</dt> <dt class="field-even">Returns</dt>
<dd class="field-even"><p>a list of sentences, and a list of labels</p> <dd class="field-even"><p>a list of sentences, and a list of labels</p>
@ -418,9 +959,19 @@ File fomart &lt;0 or 1&gt; &lt;document&gt;</p>
<dt class="sig sig-object py" id="quapy.data.reader.reindex_labels"> <dt class="sig sig-object py" id="quapy.data.reader.reindex_labels">
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">reindex_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.reader.reindex_labels" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">reindex_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.reader.reindex_labels" title="Permalink to this definition"></a></dt>
<dd><p>Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes. <dd><p>Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
E.g., y=[B, B, A, C] -&gt; [1,1,0,2], [A,B,C] E.g.:</p>
:param y: the list or array of original labels <div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">reindex_labels</span><span class="p">([</span><span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;A&#39;</span><span class="p">,</span> <span class="s1">&#39;C&#39;</span><span class="p">])</span>
:return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.</p> <span class="gp">&gt;&gt;&gt; </span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">]),</span> <span class="n">array</span><span class="p">([</span><span class="s1">&#39;A&#39;</span><span class="p">,</span> <span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;C&#39;</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="s1">&#39;&lt;U1&#39;</span><span class="p">))</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>y</strong> the list or array of original labels</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.</p>
</dd>
</dl>
</dd></dl> </dd></dl>
</section> </section>

View File

@ -515,14 +515,20 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.evaluation.artificial_prevalence_prediction"> <dt class="sig sig-object py" id="quapy.evaluation.artificial_prevalence_prediction">
<span class="sig-prename descclassname"><span class="pre">quapy.evaluation.</span></span><span class="sig-name descname"><span class="pre">artificial_prevalence_prediction</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="quapy.method.html#quapy.method.base.BaseQuantifier" title="quapy.method.base.BaseQuantifier"><span class="pre">quapy.method.base.BaseQuantifier</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="quapy.data.html#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevpoints</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">210</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_repetitions</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eval_budget</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">42</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.evaluation.artificial_prevalence_prediction" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.evaluation.</span></span><span class="sig-name descname"><span class="pre">artificial_prevalence_prediction</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="quapy.method.html#quapy.method.base.BaseQuantifier" title="quapy.method.base.BaseQuantifier"><span class="pre">quapy.method.base.BaseQuantifier</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="quapy.data.html#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevpoints</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">210</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_repetitions</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eval_budget</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">42</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.evaluation.artificial_prevalence_prediction" title="Permalink to this definition"></a></dt>
<dd><p>Performs the predictions for all samples generated according to the artificial sampling protocol. <dd><p>Performs the predictions for all samples generated according to the artificial sampling protocol.</p>
:param model: the model in charge of generating the class prevalence estimations <dl class="field-list simple">
:param test: the test set on which to perform arificial sampling <dt class="field-odd">Parameters</dt>
:param sample_size: the size of the samples <dd class="field-odd"><ul class="simple">
:param n_prevpoints: the number of different prevalences to sample (or set to None if eval_budget is specified) <li><p><strong>model</strong> the model in charge of generating the class prevalence estimations</p></li>
:param n_repetitions: the number of repetitions for each prevalence <li><p><strong>test</strong> the test set on which to perform arificial sampling</p></li>
:param eval_budget: if specified, sets a ceil on the number of evaluations to perform. For example, if there are 3 <li><p><strong>sample_size</strong> the size of the samples</p></li>
classes, n_repetitions=1 and eval_budget=20, then n_prevpoints will be set to 5, since this will generate 15 <li><p><strong>n_prevpoints</strong> the number of different prevalences to sample (or set to None if eval_budget is specified)</p></li>
<li><p><strong>n_repetitions</strong> the number of repetitions for each prevalence</p></li>
<li><p><strong>eval_budget</strong> if specified, sets a ceil on the number of evaluations to perform. For example, if there are 3</p></li>
</ul>
</dd>
</dl>
<p>classes, n_repetitions=1 and eval_budget=20, then n_prevpoints will be set to 5, since this will generate 15
different prevalences ([0, 0, 1], [0, 0.25, 0.75], [0, 0.5, 0.5] … [1, 0, 0]) and since setting it n_prevpoints different prevalences ([0, 0, 1], [0, 0.25, 0.75], [0, 0.5, 0.5] … [1, 0, 0]) and since setting it n_prevpoints
to 6 would produce more than 20 evaluations. to 6 would produce more than 20 evaluations.
:param n_jobs: number of jobs to be run in parallel :param n_jobs: number of jobs to be run in parallel
@ -601,7 +607,31 @@ contains the the prevalence estimations</p>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.functional.artificial_prevalence_sampling"> <dt class="sig sig-object py" id="quapy.functional.artificial_prevalence_sampling">
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">artificial_prevalence_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dimensions</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeat</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_constrained_dim</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.artificial_prevalence_sampling" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">artificial_prevalence_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dimensions</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeat</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_constrained_dim</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.artificial_prevalence_sampling" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Generates vectors of prevalence values artificially drawn from an exhaustive grid of prevalence values. The
number of prevalence values explored for each dimension depends on <cite>n_prevalences</cite>, so that, if, for example,
<cite>n_prevalences=11</cite> then the prevalence values of the grid are taken from [0, 0.1, 0.2, …, 0.9, 1]. Only
valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
valid vector of prevalence values, <cite>repeat</cite> copies are returned. The vector of prevalence values can be
implicit (by setting <cite>return_constrained_dim=False</cite>), meaning that the last dimension (which is constrained
to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to 1).</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dimensions</strong> the number of classes</p></li>
<li><p><strong>n_prevalences</strong> the number of equidistant prevalence points to extract from the [0,1] interval for the grid
(default is 21)</p></li>
<li><p><strong>repeat</strong> number of copies for each valid prevalence vector (default is 1)</p></li>
<li><p><strong>return_constrained_dim</strong> set to True to return all dimensions, or to False (default) for ommitting the
constrained dimension</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an ndarray of shape <cite>(n, dimensions)</cite> if <cite>return_constrained_dim=True</cite> or of shape <cite>(n, dimensions-1)</cite>
if <cite>return_constrained_dim=False</cite>, where <cite>n</cite> is the number of valid combinations found in the grid multiplied
by <cite>repeat</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.functional.get_nprevpoints_approximation"> <dt class="sig sig-object py" id="quapy.functional.get_nprevpoints_approximation">
@ -634,8 +664,21 @@ number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.functional.prevalence_from_labels"> <dt class="sig sig-object py" id="quapy.functional.prevalence_from_labels">
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_from_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes_</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_from_labels" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_from_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_from_labels" title="Permalink to this definition"></a></dt>
<dd></dd></dl> <dd><p>Computed the prevalence values from a vector of labels.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>labels</strong> array-like of shape <cite>(n_instances)</cite> with the label for each instance</p></li>
<li><p><strong>classes</strong> the class labels. This is needed in order to correctly compute the prevalence vector even when
some classes have no examples.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an ndarray of shape <cite>(len(classes))</cite> with the class prevalence values</p>
</dd>
</dl>
</dd></dl>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.functional.prevalence_from_probabilities"> <dt class="sig sig-object py" id="quapy.functional.prevalence_from_probabilities">
@ -645,13 +688,21 @@ number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.functional.prevalence_linspace"> <dt class="sig sig-object py" id="quapy.functional.prevalence_linspace">
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_linspace</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeat</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">smooth_limits_epsilon</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.01</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_linspace" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_linspace</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeat</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">smooth_limits_epsilon</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.01</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_linspace" title="Permalink to this definition"></a></dt>
<dd><p>Produces a uniformly separated values of prevalence. By default, produces an array 21 prevalences, with step 0.05 <dd><p>Produces a uniformly separated values of prevalence. By default, produces an array of 21 prevalence values, with
and with the limits smoothed, i.e.: step 0.05 and with the limits smoothed, i.e.:
[0.01, 0.05, 0.10, 0.15, …, 0.90, 0.95, 0.99] [0.01, 0.05, 0.10, 0.15, …, 0.90, 0.95, 0.99]</p>
:param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21) <dl class="field-list simple">
:param repeat: number of times each prevalence is to be repeated (defaults to 1) <dt class="field-odd">Parameters</dt>
:param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1 <dd class="field-odd"><ul class="simple">
:return: an array of uniformly separated prevalence values</p> <li><p><strong>n_prevalences</strong> the number of prevalence values to sample from the [0,1] interval (default 21)</p></li>
<li><p><strong>repeat</strong> number of times each prevalence is to be repeated (defaults to 1)</p></li>
<li><p><strong>smooth_limits_epsilon</strong> the quantity to add and subtract to the limits 0 and 1</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an array of uniformly separated prevalence values</p>
</dd>
</dl>
</dd></dl> </dd></dl>
<dl class="py function"> <dl class="py function">

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,3 @@
from abc import abstractmethod
from typing import List, Union
import numpy as np import numpy as np
from scipy.sparse import issparse from scipy.sparse import issparse
from scipy.sparse import vstack from scipy.sparse import vstack
@ -9,18 +6,19 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from quapy.functional import artificial_prevalence_sampling, strprev from quapy.functional import artificial_prevalence_sampling, strprev
class LabelledCollection: class LabelledCollection:
''' """
A LabelledCollection is a set of objects each with a label associated to it. A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling
''' routines.
:param instances: array-like (np.ndarray, list, or csr_matrix are supported)
:param labels: array-like with the same length of instances
:param classes_: optional, list of classes from which labels are taken. If not specified, the classes are inferred
from the labels. The classes must be indicated in cases in which some of the labels might have no examples
(i.e., a prevalence of 0)
"""
def __init__(self, instances, labels, classes_=None): def __init__(self, instances, labels, classes_=None):
"""
:param instances: list of objects
:param labels: list of labels, same length of instances
:param classes_: optional, list of classes from which labels are taken. When used, must contain the set of values used in labels.
"""
if issparse(instances): if issparse(instances):
self.instances = instances self.instances = instances
elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str): elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):
@ -42,28 +40,81 @@ class LabelledCollection:
@classmethod @classmethod
def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs): def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):
"""
Loads a labelled set of data and convert it into a :class:`LabelledCollection` instance. The function in charge
of reading the instances must be specified. This function can be a custom one, or any of the reading functions
defined in :mod:`quapy.data.reader` module.
:param path: string, the path to the file containing the labelled instances
:param loader_func: a custom function that implements the data loader and returns a tuple with instances and
labels
:param classes: array-like, the classes according to which the instances are labelled
:param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances, i.e.,
these arguments are used to call `loader_func(path, **loader_kwargs)`
:return: a :class:`LabelledCollection` object
"""
return LabelledCollection(*loader_func(path, **loader_kwargs), classes) return LabelledCollection(*loader_func(path, **loader_kwargs), classes)
def __len__(self): def __len__(self):
"""
Returns the length of this collection (number of labelled instances)
:return: integer
"""
return self.instances.shape[0] return self.instances.shape[0]
def prevalence(self): def prevalence(self):
"""
Returns the prevalence, or relative frequency, of the classes of interest.
:return: a np.ndarray of shape `(n_classes)` with the relative frequencies of each class, in the same order
as listed by `self.classes_`
"""
return self.counts() / len(self) return self.counts() / len(self)
def counts(self): def counts(self):
"""
Returns the number of instances for each of the classes of interest.
:return: a np.ndarray of shape `(n_classes)` with the number of instances of each class, in the same order
as listed by `self.classes_`
"""
return np.asarray([len(self.index[class_]) for class_ in self.classes_]) return np.asarray([len(self.index[class_]) for class_ in self.classes_])
@property @property
def n_classes(self): def n_classes(self):
"""
The number of classes
:return: integer
"""
return len(self.classes_) return len(self.classes_)
@property @property
def binary(self): def binary(self):
"""
Returns True if the number of classes is 2
:return: boolean
"""
return self.n_classes == 2 return self.n_classes == 2
def sampling_index(self, size, *prevs, shuffle=True): def sampling_index(self, size, *prevs, shuffle=True):
"""
Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
prevalence values are not specified, then returns the index of a uniform sampling.
For each class, the sampling is drawn without replacement if the requested prevalence is larger than
the actual prevalence of the class, or with replacement otherwise.
:param size: integer, the requested size
:param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since
it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
`self.classes_` can be specified, while the other class takes prevalence value `1-p`
:param shuffle: if set to True (default), shuffles the index before returning it
:return: a np.ndarray of shape `(size)` with the indexes
"""
if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling
return np.random.choice(len(self), size, replace=False) return self.uniform_sampling_index(size)
if len(prevs) == self.n_classes - 1: if len(prevs) == self.n_classes - 1:
prevs = prevs + (1 - sum(prevs),) prevs = prevs + (1 - sum(prevs),)
assert len(prevs) == self.n_classes, 'unexpected number of prevalences' assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
@ -93,47 +144,142 @@ class LabelledCollection:
return indexes_sample return indexes_sample
def uniform_sampling_index(self, size): def uniform_sampling_index(self, size):
"""
Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
without replacement if the requested size is greater than the number of instances, or with replacement
otherwise.
:param size: integer, the size of the uniform sample
:return: a np.ndarray of shape `(size)` with the indexes
"""
return np.random.choice(len(self), size, replace=False) return np.random.choice(len(self), size, replace=False)
def uniform_sampling(self, size):
unif_index = self.uniform_sampling_index(size)
return self.sampling_from_index(unif_index)
def sampling(self, size, *prevs, shuffle=True): def sampling(self, size, *prevs, shuffle=True):
"""
Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence
values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
the actual prevalence of the class, or with replacement otherwise.
:param size: integer, the requested size
:param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since
it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
`self.classes_` can be specified, while the other class takes prevalence value `1-p`
:param shuffle: if set to True (default), shuffles the index before returning it
:return: an instance of :class:`LabelledCollection` with length == `size` and prevalence close to `prevs` (or
prevalence == `prevs` if the exact prevalence values can be met as proportions of instances)
"""
prev_index = self.sampling_index(size, *prevs, shuffle=shuffle) prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
return self.sampling_from_index(prev_index) return self.sampling_from_index(prev_index)
def uniform_sampling(self, size):
"""
Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
without replacement if the requested size is greater than the number of instances, or with replacement
otherwise.
:param size: integer, the requested size
:return: an instance of :class:`LabelledCollection` with length == `size`
"""
unif_index = self.uniform_sampling_index(size)
return self.sampling_from_index(unif_index)
def sampling_from_index(self, index): def sampling_from_index(self, index):
"""
Returns an instance of :class:`LabelledCollection` whose elements are sampled from this collection using the
index.
:param index: np.ndarray
:return: an instance of :class:`LabelledCollection`
"""
documents = self.instances[index] documents = self.instances[index]
labels = self.labels[index] labels = self.labels[index]
return LabelledCollection(documents, labels, classes_=self.classes_) return LabelledCollection(documents, labels, classes_=self.classes_)
def split_stratified(self, train_prop=0.6, random_state=None): def split_stratified(self, train_prop=0.6, random_state=None):
# with temp_seed(42): """
Returns two instances of :class:`LabelledCollection` split with stratification from this collection, at desired
proportion.
:param train_prop: the proportion of elements to include in the left-most returned collection (typically used
as the training collection). The rest of elements are included in the right-most returned collection
(typically used as a test collection).
:param random_state: if specified, guarantees reproducibility of the split.
:return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the
second one with `1-train_prop` elements
"""
tr_docs, te_docs, tr_labels, te_labels = \ tr_docs, te_docs, tr_labels, te_labels = \
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
random_state=random_state) random_state=random_state)
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels) return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1): def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
"""
A generator of samples that implements the artificial prevalence protocol (APP). The APP consists of exploring
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, ..., 1]), and generating all valid combinations of
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
[1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
combination of prevalence values is indicated by `repeats`
:param sample_size: the number of instances in each sample
:param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
:param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
:return: yield samples generated at artificially controlled prevalence values
"""
dimensions = self.n_classes dimensions = self.n_classes
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats): for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
yield self.sampling(sample_size, *prevs) yield self.sampling(sample_size, *prevs)
def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1): def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
"""
A generator of sample indexes implementing the artificial prevalence protocol (APP).
The APP consists of exploring
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, ..., 1]), and generating all valid combinations of
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
[1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of sample indexes for each valid
combination of prevalence values is indicated by `repeats`
:param sample_size: the number of instances in each sample (i.e., length of each index)
:param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
:param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
:return: yield the indexes that generate the samples according to APP
"""
dimensions = self.n_classes dimensions = self.n_classes
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats): for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
yield self.sampling_index(sample_size, *prevs) yield self.sampling_index(sample_size, *prevs)
def natural_sampling_generator(self, sample_size, repeats=100): def natural_sampling_generator(self, sample_size, repeats=100):
"""
A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
:param sample_size: integer, the number of instances in each sample
:param repeats: the number of samples to generate
:return: yield instances of :class:`LabelledCollection`
"""
for _ in range(repeats): for _ in range(repeats):
yield self.uniform_sampling(sample_size) yield self.uniform_sampling(sample_size)
def natural_sampling_index_generator(self, sample_size, repeats=100): def natural_sampling_index_generator(self, sample_size, repeats=100):
"""
A generator of sample indexes according to the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
:param sample_size: integer, the number of instances in each sample (i.e., the length of each index)
:param repeats: the number of indexes to generate
:return: yield `repeats` instances of np.ndarray with shape `(sample_size,)`
"""
for _ in range(repeats): for _ in range(repeats):
yield self.uniform_sampling_index(sample_size) yield self.uniform_sampling_index(sample_size)
def __add__(self, other): def __add__(self, other):
"""
Returns a new :class:`LabelledCollection` as the union of this collection with another collection
:param other: another :class:`LabelledCollection`
:return: a :class:`LabelledCollection` representing the union of both collections
"""
if other is None: if other is None:
return self return self
elif issparse(self.instances) and issparse(other.instances): elif issparse(self.instances) and issparse(other.instances):
@ -149,9 +295,29 @@ class LabelledCollection:
@property @property
def Xy(self): def Xy(self):
"""
Gets the instances and labels. This is useful when working with `sklearn` estimators, e.g.:
>>> svm = LinearSVC().fit(*my_collection.Xy)
:return: a tuple `(instances, labels)` from this collection
"""
return self.instances, self.labels return self.instances, self.labels
def stats(self, show=True): def stats(self, show=True):
"""
Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,:
>>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
>>> data.training.stats()
>>> #instances=3821, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]
:param show: if set to True (default), prints the stats in standard output
:return: a dictionary containing some stats of this collection. Keys include `#instances` (the number of
instances), `type` (the type representing the instances), `#features` (the number of features, if the
instances are in array-like format), `#classes` (the classes of the collection), `prevs` (the prevalence
values for each class)
"""
ninstances = len(self) ninstances = len(self)
instance_type = type(self.instances[0]) instance_type = type(self.instances[0])
if instance_type == list: if instance_type == list:
@ -171,6 +337,14 @@ class LabelledCollection:
return stats_ return stats_
def kFCV(self, nfolds=5, nrepeats=1, random_state=0): def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
"""
Generator of stratified folds to be used in k-fold cross validation.
:param nfolds: integer (default 5), the number of folds to generate
:param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run
:param random_state: integer (default 0), guarantees that the folds generated are reproducible
:return: yields `nfolds * nrepeats` folds for k-fold cross validation
"""
kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state) kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
for train_index, test_index in kf.split(*self.Xy): for train_index, test_index in kf.split(*self.Xy):
train = self.sampling_from_index(train_index) train = self.sampling_from_index(train_index)
@ -178,8 +352,15 @@ class LabelledCollection:
yield train, test yield train, test
class Dataset: class Dataset:
"""
Abstraction of training and test :class:`LabelledCollection` objects.
:param training: a :class:`LabelledCollection` instance
:param test: a :class:`LabelledCollection` instance
:param vocabulary: if indicated, is a dictionary of the terms used in this textual dataset
:param name: a string representing the name of the dataset
"""
def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''): def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections' assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections'
@ -190,45 +371,118 @@ class Dataset:
@classmethod @classmethod
def SplitStratified(cls, collection: LabelledCollection, train_size=0.6): def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
"""
Generates a :class:`Dataset` from a stratified split of a :class:`LabelledCollection` instance.
See :meth:`LabelledCollection.split_stratified`
:param collection: :class:`LabelledCollection`
:param train_size: the proportion of training documents (the rest conforms the test split)
:return: an instance of :class:`Dataset`
"""
return Dataset(*collection.split_stratified(train_prop=train_size)) return Dataset(*collection.split_stratified(train_prop=train_size))
@property @property
def classes_(self): def classes_(self):
"""
The classes according to which the training collection is labelled
:return: The classes according to which the training collection is labelled
"""
return self.training.classes_ return self.training.classes_
@property @property
def n_classes(self): def n_classes(self):
"""
The number of classes according to which the training collection is labelled
:return: integer
"""
return self.training.n_classes return self.training.n_classes
@property @property
def binary(self): def binary(self):
"""
Returns True if the training collection is labelled according to two classes
:return: boolean
"""
return self.training.binary return self.training.binary
@classmethod @classmethod
def load(cls, train_path, test_path, loader_func: callable): def load(cls, train_path, test_path, loader_func: callable, classes=None, **loader_kwargs):
training = LabelledCollection.load(train_path, loader_func) """
test = LabelledCollection.load(test_path, loader_func) Loads a training and a test labelled set of data and convert it into a :class:`Dataset` instance.
The function in charge of reading the instances must be specified. This function can be a custom one, or any of
the reading functions defined in :mod:`quapy.data.reader` module.
:param train_path: string, the path to the file containing the training instances
:param test_path: string, the path to the file containing the test instances
:param loader_func: a custom function that implements the data loader and returns a tuple with instances and
labels
:param classes: array-like, the classes according to which the instances are labelled
:param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances.
See :meth:`LabelledCollection.load` for further details.
:return: a :class:`Dataset` object
"""
training = LabelledCollection.load(train_path, loader_func, classes, **loader_kwargs)
test = LabelledCollection.load(test_path, loader_func, classes, **loader_kwargs)
return Dataset(training, test) return Dataset(training, test)
@property @property
def vocabulary_size(self): def vocabulary_size(self):
"""
If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary
:return: integer
"""
return len(self.vocabulary) return len(self.vocabulary)
def stats(self): def stats(self, show):
"""
Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:
>>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
>>> data.stats()
>>> Dataset=kindle #tr-instances=3821, #te-instances=21591, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]
:param show: if set to True (default), prints the stats in standard output
:return: a dictionary containing some stats of this collection for the training and test collections. The keys
are `train` and `test`, and point to dedicated dictionaries of stats, for each collection, with keys
`#instances` (the number of instances), `type` (the type representing the instances),
`#features` (the number of features, if the instances are in array-like format), `#classes` (the classes of
the collection), `prevs` (the prevalence values for each class)
"""
tr_stats = self.training.stats(show=False) tr_stats = self.training.stats(show=False)
te_stats = self.test.stats(show=False) te_stats = self.test.stats(show=False)
print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, ' if show:
f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, ' print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}') f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
return {'train': tr_stats, 'test': te_stats} return {'train': tr_stats, 'test': te_stats}
@classmethod @classmethod
def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0): def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):
"""
Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around
:meth:`LabelledCollection.kFCV` that returns :class:`Dataset` instances made of training and test folds.
:param nfolds: integer (default 5), the number of folds to generate
:param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run
:param random_state: integer (default 0), guarantees that the folds generated are reproducible
:return: yields `nfolds * nrepeats` folds for k-fold cross validation as instances of :class:`Dataset`
"""
for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)): for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)):
yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})') yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')
def isbinary(data): def isbinary(data):
"""
Returns True if `data` is either a binary :class:`Dataset` or a binary :class:`LabelledCollection`
:param data: a :class:`Dataset` or a :class:`LabelledCollection` object
:return: True if labelled according to two classes
"""
if isinstance(data, Dataset) or isinstance(data, LabelledCollection): if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
return data.binary return data.binary
return False return False

View File

@ -5,9 +5,6 @@ warnings.warn = warn
import os import os
import zipfile import zipfile
from os.path import join from os.path import join
from urllib.error import HTTPError
from sklearn.model_selection import StratifiedKFold
import pandas as pd import pandas as pd
from quapy.data.base import Dataset, LabelledCollection from quapy.data.base import Dataset, LabelledCollection
@ -49,18 +46,20 @@ UCI_DATASETS = ['acute.a', 'acute.b',
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset: def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
""" """
Load a Reviews dataset as a Dataset instance, as used in: Loads a Reviews dataset as a Dataset instance, as used in
Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification." `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_.
The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`
:param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb' :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
:param tfidf: set to True to transform the raw documents into tfidf weighted matrices :param tfidf: set to True to transform the raw documents into tfidf weighted matrices
:param min_df: minimun number of documents that should contain a term in order for the term to be :param min_df: minimun number of documents that should contain a term in order for the term to be
kept (ignored if tfidf==False) kept (ignored if tfidf==False)
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory) ~/quay_data/ directory)
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations faster subsequent invokations
:return: a Dataset instance :return: a :class:`quapy.data.base.Dataset` instance
""" """
assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \ assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \ f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
@ -93,22 +92,25 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset: def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
""" """
Load a Twitter dataset as a Dataset instance, as used in: Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis. `Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining6(19), 122 (2016) Social Network Analysis and Mining6(19), 122 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_
The datasets 'semeval13', 'semeval14', 'semeval15' share the same training set. Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
The list of valid dataset names corresponding to training sets can be accessed in
`quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in
`quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`
:param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13', :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb' 'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
:param for_model_selection: if True, then returns the train split as the training set and the devel split :param for_model_selection: if True, then returns the train split as the training set and the devel split
as the test set; if False, then returns the train+devel split as the training set and the test set as the as the test set; if False, then returns the train+devel split as the training set and the test set as the
test set test set
:param min_df: minimun number of documents that should contain a term in order for the term to be kept :param min_df: minimun number of documents that should contain a term in order for the term to be kept
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory) ~/quay_data/ directory)
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations faster subsequent invokations
:return: a Dataset instance :return: a :class:`quapy.data.base.Dataset` instance
""" """
assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \ assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \ f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
@ -163,11 +165,58 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset: def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
"""
Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
and
`Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
information on how to use these collections), and so a train-test split is generated at desired proportion.
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
:return: a :class:`quapy.data.base.Dataset` instance
"""
data = fetch_UCILabelledCollection(dataset_name, data_home, verbose) data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
return Dataset(*data.split_stratified(1 - test_split, random_state=0)) return Dataset(*data.split_stratified(1 - test_split, random_state=0))
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset: def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
"""
Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
and
`Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
>>> import quapy as qp
>>> collection = qp.datasets.fetch_UCILabelledCollection("yeast")
>>> for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
>>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
:return: a :class:`quapy.data.base.Dataset` instance
"""
assert dataset_name in UCI_DATASETS, \ assert dataset_name in UCI_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \ f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
@ -302,7 +351,7 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t') df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False) df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
[df_replace(df, col) for col in range(1, 6)] [_df_replace(df, col) for col in range(1, 6)]
X = df.loc[:, 0:5].values X = df.loc[:, 0:5].values
if dataset_name == 'acute.a': if dataset_name == 'acute.a':
y = binarize(df[6], pos_class='yes') y = binarize(df[6], pos_class='yes')
@ -482,5 +531,5 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
return data return data
def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float): def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False) df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)

View File

@ -12,14 +12,18 @@ from .base import LabelledCollection
def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs): def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
""" """
Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors Transforms a :class:`quapy.data.base.Dataset` of textual instances into a :class:`quapy.data.base.Dataset` of
:param dataset: a Dataset where the instances are lists of str tfidf weighted sparse vectors
:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary
:param sublinear_tf: whether or not to apply the log scalling to the tf counters :param dataset: a :class:`quapy.data.base.Dataset` where the instances of training and test collections are
:param inplace: whether or not to apply the transformation inplace, or to a new copy lists of str
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer) :param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)
:return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True) :param sublinear_tf: whether or not to apply the log scalling to the tf counters (default True)
where the instances are stored in a csr_matrix of real-valued tfidf scores :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
:param kwargs: the rest of parameters of the transformation (as for sklearn's
`TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_)
:return: a new :class:`quapy.data.base.Dataset` in `csr_matrix` format (if inplace=False) or a reference to the
current Dataset (if inplace=True) where the instances are stored in a `csr_matrix` of real-valued tfidf scores
""" """
__check_type(dataset.training.instances, np.ndarray, str) __check_type(dataset.training.instances, np.ndarray, str)
__check_type(dataset.test.instances, np.ndarray, str) __check_type(dataset.test.instances, np.ndarray, str)
@ -41,13 +45,17 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
def reduce_columns(dataset: Dataset, min_df=5, inplace=False): def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
""" """
Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least Reduces the dimensionality of the instances, represented as a `csr_matrix` (or any subtype of
_min_df_ instances `scipy.sparse.spmatrix`), of training and test documents by removing the columns of words which are not present
:param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix) in at least `min_df` instances in the training set
:param min_df: minimum number of instances below which the columns are removed
:param inplace: whether or not to apply the transformation inplace, or to a new copy :param dataset: a :class:`quapy.data.base.Dataset` in which instances are represented in sparse format (any
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True) subtype of scipy.sparse.spmatrix)
where the dimensions corresponding to infrequent instances have been removed :param min_df: integer, minimum number of instances below which the columns are removed
:param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
:return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current
:class:`quapy.data.base.Dataset` (inplace=True) where the dimensions corresponding to infrequent terms
in the training set have been removed
""" """
__check_type(dataset.training.instances, spmatrix) __check_type(dataset.training.instances, spmatrix)
__check_type(dataset.test.instances, spmatrix) __check_type(dataset.test.instances, spmatrix)
@ -71,7 +79,17 @@ def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
return Dataset(training, test) return Dataset(training, test)
def standardize(dataset: Dataset, inplace=True): def standardize(dataset: Dataset, inplace=False):
"""
Standardizes the real-valued columns of a :class:`quapy.data.base.Dataset`.
Standardization, aka z-scoring, of a variable `X` comes down to subtracting the average and normalizing by the
standard deviation.
:param dataset: a :class:`quapy.data.base.Dataset` object
:param inplace: set to True if the transformation is to be applied inplace, or to False (default) if a new
:class:`quapy.data.base.Dataset` is to be returned
:return:
"""
s = StandardScaler(copy=not inplace) s = StandardScaler(copy=not inplace)
training = s.fit_transform(dataset.training.instances) training = s.fit_transform(dataset.training.instances)
test = s.transform(dataset.test.instances) test = s.transform(dataset.test.instances)
@ -83,14 +101,18 @@ def standardize(dataset: Dataset, inplace=True):
def index(dataset: Dataset, min_df=5, inplace=False, **kwargs): def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
""" """
Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index. Indexes the tokens of a textual :class:`quapy.data.base.Dataset` of string documents.
Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK To index a document means to replace each different token by a unique numerical index.
:param dataset: a Dataset where the instances are lists of str Rare words (i.e., words occurring less than `min_df` times) are replaced by a special token `UNK`
:param min_df: minimum number of instances below which the term is replaced by a UNK index
:param inplace: whether or not to apply the transformation inplace, or to a new copy :param dataset: a :class:`quapy.data.base.Dataset` object where the instances of training and test documents
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer) are lists of str
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True) :param min_df: minimum number of occurrences below which the term is replaced by a `UNK` index
consisting of lists of integer values representing indices. :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
:param kwargs: the rest of parameters of the transformation (as for sklearn's
`CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>_`)
:return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current
:class:`quapy.data.base.Dataset` (inplace=True) consisting of lists of integer values representing indices.
""" """
__check_type(dataset.training.instances, np.ndarray, str) __check_type(dataset.training.instances, np.ndarray, str)
__check_type(dataset.test.instances, np.ndarray, str) __check_type(dataset.test.instances, np.ndarray, str)
@ -120,17 +142,23 @@ def __check_type(container, container_type=None, element_type=None):
class IndexTransformer: class IndexTransformer:
"""
This class implements a sklearn's-style transformer that indexes text as numerical ids for the tokens it
contains, and that would be generated by sklearn's
`CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
:param kwargs: keyworded arguments from `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
"""
def __init__(self, **kwargs): def __init__(self, **kwargs):
"""
:param kwargs: keyworded arguments from _sklearn.feature_extraction.text.CountVectorizer_
"""
self.vect = CountVectorizer(**kwargs) self.vect = CountVectorizer(**kwargs)
self.unk = -1 # a valid index is assigned after fit self.unk = -1 # a valid index is assigned after fit
self.pad = -2 # a valid index is assigned after fit self.pad = -2 # a valid index is assigned after fit
def fit(self, X): def fit(self, X):
""" """
Fits the transformer, i.e., decides on the vocabulary, given a list of strings.
:param X: a list of strings :param X: a list of strings
:return: self :return: self
""" """
@ -142,22 +170,52 @@ class IndexTransformer:
return self return self
def transform(self, X, n_jobs=-1): def transform(self, X, n_jobs=-1):
"""
Transforms the strings in `X` as lists of numerical ids
:param X: a list of strings
:param n_jobs: the number of parallel workers to carry out this task
:return: a `np.ndarray` of numerical ids
"""
# given the number of tasks and the number of jobs, generates the slices for the parallel processes # given the number of tasks and the number of jobs, generates the slices for the parallel processes
assert self.unk != -1, 'transform called before fit' assert self.unk != -1, 'transform called before fit'
indexed = map_parallel(func=self.index, args=X, n_jobs=n_jobs) indexed = map_parallel(func=self._index, args=X, n_jobs=n_jobs)
return np.asarray(indexed) return np.asarray(indexed)
def index(self, documents): def _index(self, documents):
vocab = self.vocabulary_.copy() vocab = self.vocabulary_.copy()
return [[vocab.prevalence(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')] return [[vocab.prevalence(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
def fit_transform(self, X, n_jobs=-1): def fit_transform(self, X, n_jobs=-1):
"""
Fits the transform on `X` and transforms it.
:param X: a list of strings
:param n_jobs: the number of parallel workers to carry out this task
:return: a `np.ndarray` of numerical ids
"""
return self.fit(X).transform(X, n_jobs=n_jobs) return self.fit(X).transform(X, n_jobs=n_jobs)
def vocabulary_size(self): def vocabulary_size(self):
"""
Gets the length of the vocabulary according to which the document tokens have been indexed
:return: integer
"""
return len(self.vocabulary_) return len(self.vocabulary_)
def add_word(self, word, id=None, nogaps=True): def add_word(self, word, id=None, nogaps=True):
"""
Adds a new token (regardless of whether it has been found in the text or not), with dedicated id.
Useful to define special tokens for codifying unknown words, or padding tokens.
:param word: string, surface form of the token
:param id: integer, numerical value to assign to the token (leave as None for indicating the next valid id,
default)
:param nogaps: if set to True (default) asserts that the id indicated leads to no numerical gaps with
precedent ids stored so far
:return: integer, the numerical id for the new token
"""
if word in self.vocabulary_: if word in self.vocabulary_:
raise ValueError(f'word {word} already in dictionary') raise ValueError(f'word {word} already in dictionary')
if id is None: if id is None:

View File

@ -7,7 +7,10 @@ def from_text(path, encoding='utf-8', verbose=1, class2int=True):
""" """
Reads a labelled colletion of documents. Reads a labelled colletion of documents.
File fomart <0 or 1>\t<document>\n File fomart <0 or 1>\t<document>\n
:param path: path to the labelled collection :param path: path to the labelled collection
:param encoding: the text encoding used to open the file
:param verbose: if >0 (default) shows some progress information in standard output
:return: a list of sentences, and a list of labels :return: a list of sentences, and a list of labels
""" """
all_sentences, all_labels = [], [] all_sentences, all_labels = [], []
@ -35,8 +38,9 @@ def from_sparse(path):
""" """
Reads a labelled collection of real-valued instances expressed in sparse format Reads a labelled collection of real-valued instances expressed in sparse format
File format <-1 or 0 or 1>[\s col(int):val(float)]\n File format <-1 or 0 or 1>[\s col(int):val(float)]\n
:param path: path to the labelled collection :param path: path to the labelled collection
:return: a csr_matrix containing the instances (rows), and a ndarray containing the labels :return: a `csr_matrix` containing the instances (rows), and a ndarray containing the labels
""" """
def split_col_val(col_val): def split_col_val(col_val):
@ -68,8 +72,10 @@ def from_csv(path, encoding='utf-8'):
""" """
Reads a csv file in which columns are separated by ','. Reads a csv file in which columns are separated by ','.
File format <label>,<feat1>,<feat2>,...,<featn>\n File format <label>,<feat1>,<feat2>,...,<featn>\n
:param path: path to the csv file :param path: path to the csv file
:return: a ndarray for the labels and a ndarray (float) for the covariates :param encoding: the text encoding used to open the file
:return: a np.ndarray for the labels and a ndarray (float) for the covariates
""" """
X, y = [], [] X, y = [], []
@ -85,11 +91,16 @@ def from_csv(path, encoding='utf-8'):
def reindex_labels(y): def reindex_labels(y):
""" """
Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes. Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
E.g., y=['B', 'B', 'A', 'C'] -> [1,1,0,2], ['A','B','C'] E.g.:
>>> reindex_labels(['B', 'B', 'A', 'C'])
>>> (array([1, 1, 0, 2]), array(['A', 'B', 'C'], dtype='<U1'))
:param y: the list or array of original labels :param y: the list or array of original labels
:return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes. :return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.
""" """
classnames = sorted(np.unique(y)) y = np.asarray(y)
classnames = np.asarray(sorted(np.unique(y)))
label2index = {label: index for index, label in enumerate(classnames)} label2index = {label: index for index, label in enumerate(classnames)}
indexed = np.empty(y.shape, dtype=np.int) indexed = np.empty(y.shape, dtype=np.int)
for label in classnames: for label in classnames:
@ -98,6 +109,17 @@ def reindex_labels(y):
def binarize(y, pos_class): def binarize(y, pos_class):
"""
Binarizes a categorical array-like collection of labels towards the positive class `pos_class`. E.g.,:
>>> binarize([1, 2, 3, 1, 1, 0], pos_class=2)
>>> array([0, 1, 0, 0, 0, 0])
:param y: array-like of labels
:param pos_class: integer, the positive class
:return: a binary np.ndarray, in which values 1 corresponds to positions in whcih `y` had `pos_class` labels, and
0 otherwise
"""
y = np.asarray(y) y = np.asarray(y)
ybin = np.zeros(y.shape, dtype=np.int) ybin = np.zeros(y.shape, dtype=np.int)
ybin[y == pos_class] = 1 ybin[y == pos_class] = 1