updating the documentation

This commit is contained in:
Alejandro Moreo Fernandez 2021-12-06 18:25:47 +01:00
parent 1f591ec105
commit 2bd47f0841
9 changed files with 1197 additions and 218 deletions

View File

@ -255,12 +255,8 @@
<li><a href="quapy.method.html#quapy.method.neural.QuaNetModule.device">(quapy.method.neural.QuaNetModule property)</a>
</li>
</ul></li>
<li><a href="quapy.data.html#quapy.data.datasets.df_replace">df_replace() (in module quapy.data.datasets)</a>
</li>
<li><a href="quapy.classification.html#quapy.classification.neural.TextClassifierNet.dimensions">dimensions() (quapy.classification.neural.TextClassifierNet method)</a>
</li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="quapy.classification.html#quapy.classification.neural.CNNnet.document_embedding">document_embedding() (quapy.classification.neural.CNNnet method)</a>
<ul>
@ -269,6 +265,8 @@
<li><a href="quapy.classification.html#quapy.classification.neural.TextClassifierNet.document_embedding">(quapy.classification.neural.TextClassifierNet method)</a>
</li>
</ul></li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="quapy.html#quapy.util.download_file">download_file() (in module quapy.util)</a>
</li>
<li><a href="quapy.html#quapy.util.download_file_if_not_exists">download_file_if_not_exists() (in module quapy.util)</a>
@ -462,19 +460,15 @@
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="quapy.data.html#quapy.data.preprocessing.index">index() (in module quapy.data.preprocessing)</a>
<ul>
<li><a href="quapy.data.html#quapy.data.preprocessing.IndexTransformer.index">(quapy.data.preprocessing.IndexTransformer method)</a>
</li>
</ul></li>
<li><a href="quapy.data.html#quapy.data.preprocessing.IndexTransformer">IndexTransformer (class in quapy.data.preprocessing)</a>
</li>
<li><a href="quapy.method.html#quapy.method.neural.QuaNetModule.init_hidden">init_hidden() (quapy.method.neural.QuaNetModule method)</a>
</li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="quapy.method.html#quapy.method.base.isaggregative">isaggregative() (in module quapy.method.base)</a>
</li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="quapy.html#quapy.isbinary">isbinary() (in module quapy)</a>
<ul>

Binary file not shown.

View File

@ -63,45 +63,144 @@
<dt class="sig sig-object py" id="quapy.data.base.Dataset">
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">Dataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">training</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">vocabulary</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Abstraction of training and test <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> objects.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>training</strong> a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance</p></li>
<li><p><strong>test</strong> a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance</p></li>
<li><p><strong>vocabulary</strong> if indicated, is a dictionary of the terms used in this textual dataset</p></li>
<li><p><strong>name</strong> a string representing the name of the dataset</p></li>
</ul>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.SplitStratified">
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">SplitStratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">collection</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">train_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.SplitStratified" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Generates a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> from a stratified split of a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance.
See <a class="reference internal" href="#quapy.data.base.LabelledCollection.split_stratified" title="quapy.data.base.LabelledCollection.split_stratified"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.split_stratified()</span></code></a></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>collection</strong> <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p></li>
<li><p><strong>train_size</strong> the proportion of training documents (the rest conforms the test split)</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.binary">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.Dataset.binary" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Returns True if the training collection is labelled according to two classes</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>boolean</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.classes_">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">classes_</span></span><a class="headerlink" href="#quapy.data.base.Dataset.classes_" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>The classes according to which the training collection is labelled</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>The classes according to which the training collection is labelled</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.kFCV">
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.kFCV" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around
<a class="reference internal" href="#quapy.data.base.LabelledCollection.kFCV" title="quapy.data.base.LabelledCollection.kFCV"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.kFCV()</span></code></a> that returns <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> instances made of training and test folds.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>nfolds</strong> integer (default 5), the number of folds to generate</p></li>
<li><p><strong>nrepeats</strong> integer (default 1), the number of rounds of k-fold cross validation to run</p></li>
<li><p><strong>random_state</strong> integer (default 0), guarantees that the folds generated are reproducible</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>yields <cite>nfolds * nrepeats</cite> folds for k-fold cross validation as instances of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.load">
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.load" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">loader_kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.load" title="Permalink to this definition"></a></dt>
<dd><p>Loads a training and a test labelled set of data and convert it into a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> instance.
The function in charge of reading the instances must be specified. This function can be a custom one, or any of
the reading functions defined in <a class="reference internal" href="#module-quapy.data.reader" title="quapy.data.reader"><code class="xref py py-mod docutils literal notranslate"><span class="pre">quapy.data.reader</span></code></a> module.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>train_path</strong> string, the path to the file containing the training instances</p></li>
<li><p><strong>test_path</strong> string, the path to the file containing the test instances</p></li>
<li><p><strong>loader_func</strong> a custom function that implements the data loader and returns a tuple with instances and
labels</p></li>
<li><p><strong>classes</strong> array-like, the classes according to which the instances are labelled</p></li>
<li><p><strong>loader_kwargs</strong> any argument that the <cite>loader_func</cite> function needs in order to read the instances.
See <a class="reference internal" href="#quapy.data.base.LabelledCollection.load" title="quapy.data.base.LabelledCollection.load"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.load()</span></code></a> for further details.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.n_classes">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.Dataset.n_classes" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>The number of classes according to which the training collection is labelled</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>integer</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.stats">
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.stats" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.stats" title="Permalink to this definition"></a></dt>
<dd><p>Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">&#39;kindle&#39;</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Dataset</span><span class="o">=</span><span class="n">kindle</span> <span class="c1">#tr-instances=3821, #te-instances=21591, type=&lt;class &#39;scipy.sparse.csr.csr_matrix&#39;&gt;, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>show</strong> if set to True (default), prints the stats in standard output</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a dictionary containing some stats of this collection for the training and test collections. The keys
are <cite>train</cite> and <cite>test</cite>, and point to dedicated dictionaries of stats, for each collection, with keys
<cite>#instances</cite> (the number of instances), <cite>type</cite> (the type representing the instances),
<cite>#features</cite> (the number of features, if the instances are in array-like format), <cite>#classes</cite> (the classes of
the collection), <cite>prevs</cite> (the prevalence values for each class)</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.vocabulary_size">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">vocabulary_size</span></span><a class="headerlink" href="#quapy.data.base.Dataset.vocabulary_size" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>integer</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
@ -109,161 +208,480 @@
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection">
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">LabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">instances</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes_</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>A LabelledCollection is a set of objects each with a label associated to it.</p>
<p>A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling
routines.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>instances</strong> array-like (np.ndarray, list, or csr_matrix are supported)</p></li>
<li><p><strong>labels</strong> array-like with the same length of instances</p></li>
<li><p><strong>classes</strong> optional, list of classes from which labels are taken. If not specified, the classes are inferred
from the labels. The classes must be indicated in cases in which some of the labels might have no examples
(i.e., a prevalence of 0)</p></li>
</ul>
</dd>
</dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.Xy">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">Xy</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.Xy" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Gets the instances and labels. This is useful when working with <cite>sklearn</cite> estimators, e.g.:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span> <span class="o">=</span> <span class="n">LinearSVC</span><span class="p">()</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="o">*</span><span class="n">my_collection</span><span class="o">.</span><span class="n">Xy</span><span class="p">)</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>a tuple <cite>(instances, labels)</cite> from this collection</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.artificial_sampling_generator">
<span class="sig-name descname"><span class="pre">artificial_sampling_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">101</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.artificial_sampling_generator" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>A generator of samples that implements the artificial prevalence protocol (APP). The APP consists of exploring
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, …, 1]), and generating all valid combinations of
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], …,
[1, 0, 0] prevalence values of size <cite>sample_size</cite> will be yielded). The number of samples for each valid
combination of prevalence values is indicated by <cite>repeats</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>sample_size</strong> the number of instances in each sample</p></li>
<li><p><strong>n_prevalences</strong> the number of prevalence points to be taken from the [0,1] interval (including the
limits {0,1}). E.g., if <cite>n_prevalences=11</cite>, then the prevalence points to take are [0, 0.1, 0.2, …, 1]</p></li>
<li><p><strong>repeats</strong> the number of samples to generate for each valid combination of prevalence values (default 1)</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>yield samples generated at artificially controlled prevalence values</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.artificial_sampling_index_generator">
<span class="sig-name descname"><span class="pre">artificial_sampling_index_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">101</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.artificial_sampling_index_generator" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>A generator of sample indexes implementing the artificial prevalence protocol (APP).
The APP consists of exploring
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, …, 1]), and generating all valid combinations of
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], …,
[1, 0, 0] prevalence values of size <cite>sample_size</cite> will be yielded). The number of sample indexes for each valid
combination of prevalence values is indicated by <cite>repeats</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>sample_size</strong> the number of instances in each sample (i.e., length of each index)</p></li>
<li><p><strong>n_prevalences</strong> the number of prevalence points to be taken from the [0,1] interval (including the
limits {0,1}). E.g., if <cite>n_prevalences=11</cite>, then the prevalence points to take are [0, 0.1, 0.2, …, 1]</p></li>
<li><p><strong>repeats</strong> the number of samples to generate for each valid combination of prevalence values (default 1)</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>yield the indexes that generate the samples according to APP</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.binary">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.binary" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Returns True if the number of classes is 2</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>boolean</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.counts">
<span class="sig-name descname"><span class="pre">counts</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.counts" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Returns the number of instances for each of the classes of interest.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>a np.ndarray of shape <cite>(n_classes)</cite> with the number of instances of each class, in the same order
as listed by <cite>self.classes_</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.kFCV">
<span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.kFCV" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Generator of stratified folds to be used in k-fold cross validation.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>nfolds</strong> integer (default 5), the number of folds to generate</p></li>
<li><p><strong>nrepeats</strong> integer (default 1), the number of rounds of k-fold cross validation to run</p></li>
<li><p><strong>random_state</strong> integer (default 0), guarantees that the folds generated are reproducible</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>yields <cite>nfolds * nrepeats</cite> folds for k-fold cross validation</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.load">
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.load" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">loader_kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.load" title="Permalink to this definition"></a></dt>
<dd><p>Loads a labelled set of data and convert it into a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance. The function in charge
of reading the instances must be specified. This function can be a custom one, or any of the reading functions
defined in <a class="reference internal" href="#module-quapy.data.reader" title="quapy.data.reader"><code class="xref py py-mod docutils literal notranslate"><span class="pre">quapy.data.reader</span></code></a> module.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>path</strong> string, the path to the file containing the labelled instances</p></li>
<li><p><strong>loader_func</strong> a custom function that implements the data loader and returns a tuple with instances and
labels</p></li>
<li><p><strong>classes</strong> array-like, the classes according to which the instances are labelled</p></li>
<li><p><strong>loader_kwargs</strong> any argument that the <cite>loader_func</cite> function needs in order to read the instances, i.e.,
these arguments are used to call <cite>loader_func(path, **loader_kwargs)</cite></p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> object</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.n_classes">
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.n_classes" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>The number of classes</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>integer</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.natural_sampling_generator">
<span class="sig-name descname"><span class="pre">natural_sampling_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.natural_sampling_generator" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>sample_size</strong> integer, the number of instances in each sample</p></li>
<li><p><strong>repeats</strong> the number of samples to generate</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>yield instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.natural_sampling_index_generator">
<span class="sig-name descname"><span class="pre">natural_sampling_index_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.natural_sampling_index_generator" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>A generator of sample indexes according to the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>sample_size</strong> integer, the number of instances in each sample (i.e., the length of each index)</p></li>
<li><p><strong>repeats</strong> the number of indexes to generate</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>yield <cite>repeats</cite> instances of np.ndarray with shape <cite>(sample_size,)</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.prevalence">
<span class="sig-name descname"><span class="pre">prevalence</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.prevalence" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Returns the prevalence, or relative frequency, of the classes of interest.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>a np.ndarray of shape <cite>(n_classes)</cite> with the relative frequencies of each class, in the same order
as listed by <cite>self.classes_</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling">
<span class="sig-name descname"><span class="pre">sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Return a random sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size and desired prevalence
values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
the actual prevalence of the class, or with replacement otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>size</strong> integer, the requested size</p></li>
<li><p><strong>prevs</strong> the prevalence for each class; the prevalence value for the last class can be lead empty since
it is constrained. E.g., for binary collections, only the prevalence <cite>p</cite> for the first class (as listed in
<cite>self.classes_</cite> can be specified, while the other class takes prevalence value <cite>1-p</cite></p></li>
<li><p><strong>shuffle</strong> if set to True (default), shuffles the index before returning it</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> with length == <cite>size</cite> and prevalence close to <cite>prevs</cite> (or
prevalence == <cite>prevs</cite> if the exact prevalence values can be met as proportions of instances)</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_from_index">
<span class="sig-name descname"><span class="pre">sampling_from_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">index</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_from_index" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Returns an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> whose elements are sampled from this collection using the
index.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>index</strong> np.ndarray</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_index">
<span class="sig-name descname"><span class="pre">sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_index" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
prevalence values are not specified, then returns the index of a uniform sampling.
For each class, the sampling is drawn without replacement if the requested prevalence is larger than
the actual prevalence of the class, or with replacement otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>size</strong> integer, the requested size</p></li>
<li><p><strong>prevs</strong> the prevalence for each class; the prevalence value for the last class can be lead empty since
it is constrained. E.g., for binary collections, only the prevalence <cite>p</cite> for the first class (as listed in
<cite>self.classes_</cite> can be specified, while the other class takes prevalence value <cite>1-p</cite></p></li>
<li><p><strong>shuffle</strong> if set to True (default), shuffles the index before returning it</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a np.ndarray of shape <cite>(size)</cite> with the indexes</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.split_stratified">
<span class="sig-name descname"><span class="pre">split_stratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_prop</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.split_stratified" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Returns two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> split with stratification from this collection, at desired
proportion.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>train_prop</strong> the proportion of elements to include in the left-most returned collection (typically used
as the training collection). The rest of elements are included in the right-most returned collection
(typically used as a test collection).</p></li>
<li><p><strong>random_state</strong> if specified, guarantees reproducibility of the split.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>, the first one with <cite>train_prop</cite> elements, and the
second one with <cite>1-train_prop</cite> elements</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.stats">
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.stats" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">&#39;kindle&#39;</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span><span class="o">.</span><span class="n">training</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1">#instances=3821, type=&lt;class &#39;scipy.sparse.csr.csr_matrix&#39;&gt;, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>show</strong> if set to True (default), prints the stats in standard output</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a dictionary containing some stats of this collection. Keys include <cite>#instances</cite> (the number of
instances), <cite>type</cite> (the type representing the instances), <cite>#features</cite> (the number of features, if the
instances are in array-like format), <cite>#classes</cite> (the classes of the collection), <cite>prevs</cite> (the prevalence
values for each class)</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling">
<span class="sig-name descname"><span class="pre">uniform_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Returns a uniform sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size. The sampling is drawn
without replacement if the requested size is greater than the number of instances, or with replacement
otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>size</strong> integer, the requested size</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> with length == <cite>size</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling_index">
<span class="sig-name descname"><span class="pre">uniform_sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling_index" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
without replacement if the requested size is greater than the number of instances, or with replacement
otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>size</strong> integer, the size of the uniform sample</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a np.ndarray of shape <cite>(size)</cite> with the indexes</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.base.isbinary">
<span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">isbinary</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.isbinary" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Returns True if <cite>data</cite> is either a binary <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> or a binary <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>data</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> or a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> object</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>True if labelled according to two classes</p>
</dd>
</dl>
</dd></dl>
</section>
<section id="module-quapy.data.datasets">
<span id="quapy-data-datasets-module"></span><h2>quapy.data.datasets module<a class="headerlink" href="#module-quapy.data.datasets" title="Permalink to this headline"></a></h2>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.df_replace">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">df_replace</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="pre">df</span></em>, <em class="sig-param"><span class="pre">col</span></em>, <em class="sig-param"><span class="pre">repl={'no':</span> <span class="pre">0</span></em>, <em class="sig-param"><span class="pre">'yes':</span> <span class="pre">1}</span></em>, <em class="sig-param"><span class="pre">astype=&lt;class</span> <span class="pre">'float'&gt;</span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.datasets.df_replace" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCIDataset">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCIDataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_split</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_UCIDataset" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Loads a UCI dataset as an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>, as used in
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253516300628">Pérez-Gállego, P., Quevedo, J. R., &amp; del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100.</a>
and
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253517303652">Pérez-Gállego, P., Castano, A., Quevedo, J. R., &amp; del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15.</a>.
The datasets do not come with a predefined train-test split (see <a class="reference internal" href="#quapy.data.datasets.fetch_UCILabelledCollection" title="quapy.data.datasets.fetch_UCILabelledCollection"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fetch_UCILabelledCollection()</span></code></a> for further
information on how to use these collections), and so a train-test split is generated at desired proportion.
The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_DATASETS</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> a dataset name</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>test_split</strong> proportion of documents to be included in the test set. The rest conforms the training set</p></li>
<li><p><strong>verbose</strong> set to True (default is False) to get information (from the UCI ML repository) about the datasets</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCILabelledCollection">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCILabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_UCILabelledCollection" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Loads a UCI collection as an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>, as used in
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253516300628">Pérez-Gállego, P., Quevedo, J. R., &amp; del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100.</a>
and
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253517303652">Pérez-Gállego, P., Castano, A., Quevedo, J. R., &amp; del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15.</a>.
The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
This can be reproduced by using <a class="reference internal" href="#quapy.data.base.Dataset.kFCV" title="quapy.data.base.Dataset.kFCV"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.data.base.Dataset.kFCV()</span></code></a>, e.g.:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">collection</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_UCILabelledCollection</span><span class="p">(</span><span class="s2">&quot;yeast&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">for</span> <span class="n">data</span> <span class="ow">in</span> <span class="n">qp</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">Dataset</span><span class="o">.</span><span class="n">kFCV</span><span class="p">(</span><span class="n">collection</span><span class="p">,</span> <span class="n">nfolds</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">nrepeats</span><span class="o">=</span><span class="mi">2</span><span class="p">):</span>
<span class="gp">&gt;&gt;&gt; </span> <span class="o">...</span>
</pre></div>
</div>
<p>The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_DATASETS</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> a dataset name</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>test_split</strong> proportion of documents to be included in the test set. The rest conforms the training set</p></li>
<li><p><strong>verbose</strong> set to True (default is False) to get information (from the UCI ML repository) about the datasets</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_reviews">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_reviews</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tfidf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_reviews" title="Permalink to this definition"></a></dt>
<dd><p>Load a Reviews dataset as a Dataset instance, as used in:
Esuli, A., Moreo, A., and Sebastiani, F. “A recurrent neural network for sentiment quantification.”
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
:param dataset_name: the name of the dataset: valid ones are hp, kindle, imdb
:param tfidf: set to True to transform the raw documents into tfidf weighted matrices
:param min_df: minimun number of documents that should contain a term in order for the term to be
kept (ignored if tfidf==False)
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations
:return: a Dataset instance</p>
<dd><p>Loads a Reviews dataset as a Dataset instance, as used in
<a class="reference external" href="https://dl.acm.org/doi/abs/10.1145/3269206.3269287">Esuli, A., Moreo, A., and Sebastiani, F. “A recurrent neural network for sentiment quantification.”
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.</a>.
The list of valid dataset names can be accessed in <cite>quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> the name of the dataset: valid ones are hp, kindle, imdb</p></li>
<li><p><strong>tfidf</strong> set to True to transform the raw documents into tfidf weighted matrices</p></li>
<li><p><strong>min_df</strong> minimun number of documents that should contain a term in order for the term to be
kept (ignored if tfidf==False)</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>pickle</strong> set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_twitter">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_twitter</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">for_model_selection</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_twitter" title="Permalink to this definition"></a></dt>
<dd><p>Load a Twitter dataset as a Dataset instance, as used in:
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining6(19), 122 (2016)
The datasets semeval13, semeval14, semeval15 share the same training set.</p>
<dd><p>Loads a Twitter dataset as a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance, as used in:
<a class="reference external" href="https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf">Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining6(19), 122 (2016)</a>
Note that the datasets semeval13, semeval14, semeval15 share the same training set.
The list of valid dataset names corresponding to training sets can be accessed in
<cite>quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN</cite>, while the test sets can be accessed in
<cite>quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>dataset_name</strong> the name of the dataset: valid ones are gasp, hcr, omd, sanders, semeval13,</p>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> the name of the dataset: valid ones are gasp, hcr, omd, sanders, semeval13,
semeval14, semeval15, semeval16, sst, wa, wb</p></li>
<li><p><strong>for_model_selection</strong> if True, then returns the train split as the training set and the devel split
as the test set; if False, then returns the train+devel split as the training set and the test set as the
test set</p></li>
<li><p><strong>min_df</strong> minimun number of documents that should contain a term in order for the term to be kept</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>pickle</strong> set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
</dd>
</dl>
<p>semeval14, semeval15, semeval16, sst, wa, wb
:param for_model_selection: if True, then returns the train split as the training set and the devel split
as the test set; if False, then returns the train+devel split as the training set and the test set as the
test set
:param min_df: minimun number of documents that should contain a term in order for the term to be kept
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations
:return: a Dataset instance</p>
</dd></dl>
<dl class="py function">
@ -278,15 +696,41 @@ faster subsequent invokations
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer">
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">IndexTransformer</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>This class implements a sklearns-style transformer that indexes text as numerical ids for the tokens it
contains, and that would be generated by sklearns
<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>kwargs</strong> <p>keyworded arguments from <a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a></p>
</p>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.add_word">
<span class="sig-name descname"><span class="pre">add_word</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">word</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nogaps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.add_word" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Adds a new token (regardless of whether it has been found in the text or not), with dedicated id.
Useful to define special tokens for codifying unknown words, or padding tokens.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>word</strong> string, surface form of the token</p></li>
<li><p><strong>id</strong> integer, numerical value to assign to the token (leave as None for indicating the next valid id,
default)</p></li>
<li><p><strong>nogaps</strong> if set to True (default) asserts that the id indicated leads to no numerical gaps with
precedent ids stored so far</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>integer, the numerical id for the new token</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit">
<span class="sig-name descname"><span class="pre">fit</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit" title="Permalink to this definition"></a></dt>
<dd><dl class="field-list simple">
<dd><p>Fits the transformer, i.e., decides on the vocabulary, given a list of strings.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>X</strong> a list of strings</p>
</dd>
@ -299,66 +743,139 @@ faster subsequent invokations
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit_transform">
<span class="sig-name descname"><span class="pre">fit_transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-</span> <span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit_transform" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.index">
<span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">documents</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.index" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Fits the transform on <cite>X</cite> and transforms it.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>X</strong> a list of strings</p></li>
<li><p><strong>n_jobs</strong> the number of parallel workers to carry out this task</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <cite>np.ndarray</cite> of numerical ids</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.transform">
<span class="sig-name descname"><span class="pre">transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-</span> <span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.transform" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Transforms the strings in <cite>X</cite> as lists of numerical ids</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>X</strong> a list of strings</p></li>
<li><p><strong>n_jobs</strong> the number of parallel workers to carry out this task</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a <cite>np.ndarray</cite> of numerical ids</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.vocabulary_size">
<span class="sig-name descname"><span class="pre">vocabulary_size</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.vocabulary_size" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Gets the length of the vocabulary according to which the document tokens have been indexed</p>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>integer</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.preprocessing.index">
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.index" title="Permalink to this definition"></a></dt>
<dd><p>Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index.
Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK
:param dataset: a Dataset where the instances are lists of str
:param min_df: minimum number of instances below which the term is replaced by a UNK index
:param inplace: whether or not to apply the transformation inplace, or to a new copy
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer)
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
consisting of lists of integer values representing indices.</p>
<dd><p>Indexes the tokens of a textual <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of string documents.
To index a document means to replace each different token by a unique numerical index.
Rare words (i.e., words occurring less than <cite>min_df</cite> times) are replaced by a special token <cite>UNK</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> object where the instances of training and test documents
are lists of str</p></li>
<li><p><strong>min_df</strong> minimum number of occurrences below which the term is replaced by a <cite>UNK</cite> index</p></li>
<li><p><strong>inplace</strong> whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
<li><p><strong>kwargs</strong> the rest of parameters of the transformation (as for sklearns</p></li>
</ul>
</dd>
</dl>
<p><cite>CountVectorizer &lt;https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html&gt;_</cite>)
:return: a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (if inplace=False) or a reference to the current</p>
<blockquote>
<div><p><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (inplace=True) consisting of lists of integer values representing indices.</p>
</div></blockquote>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.preprocessing.reduce_columns">
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">reduce_columns</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.reduce_columns" title="Permalink to this definition"></a></dt>
<dd><p>Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least
_min_df_ instances
:param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix)
:param min_df: minimum number of instances below which the columns are removed
:param inplace: whether or not to apply the transformation inplace, or to a new copy
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
where the dimensions corresponding to infrequent instances have been removed</p>
<dd><p>Reduces the dimensionality of the instances, represented as a <cite>csr_matrix</cite> (or any subtype of
<cite>scipy.sparse.spmatrix</cite>), of training and test documents by removing the columns of words which are not present
in at least <cite>min_df</cite> instances in the training set</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> in which instances are represented in sparse format (any
subtype of scipy.sparse.spmatrix)</p></li>
<li><p><strong>min_df</strong> integer, minimum number of instances below which the columns are removed</p></li>
<li><p><strong>inplace</strong> whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (if inplace=False) or a reference to the current
<a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (inplace=True) where the dimensions corresponding to infrequent terms
in the training set have been removed</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.preprocessing.standardize">
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">standardize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.standardize" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">standardize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.standardize" title="Permalink to this definition"></a></dt>
<dd><p>Standardizes the real-valued columns of a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>.
Standardization, aka z-scoring, of a variable <cite>X</cite> comes down to subtracting the average and normalizing by the
standard deviation.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> object</p></li>
<li><p><strong>inplace</strong> set to True if the transformation is to be applied inplace, or to False (default) if a new
<a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> is to be returned</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p></p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.preprocessing.text2tfidf">
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">text2tfidf</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sublinear_tf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.text2tfidf" title="Permalink to this definition"></a></dt>
<dd><p>Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors
:param dataset: a Dataset where the instances are lists of str
:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary
:param sublinear_tf: whether or not to apply the log scalling to the tf counters
:param inplace: whether or not to apply the transformation inplace, or to a new copy
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer)
:return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)
where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
<dd><p>Transforms a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of textual instances into a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of
tfidf weighted sparse vectors</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> where the instances of training and test collections are
lists of str</p></li>
<li><p><strong>min_df</strong> minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)</p></li>
<li><p><strong>sublinear_tf</strong> whether or not to apply the log scalling to the tf counters (default True)</p></li>
<li><p><strong>inplace</strong> whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
<li><p><strong>kwargs</strong> the rest of parameters of the transformation (as for sklearns
<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html">TfidfVectorizer</a>)</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> in <cite>csr_matrix</cite> format (if inplace=False) or a reference to the
current Dataset (if inplace=True) where the instances are stored in a <cite>csr_matrix</cite> of real-valued tfidf scores</p>
</dd>
</dl>
</dd></dl>
</section>
@ -367,7 +884,24 @@ where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.reader.binarize">
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">binarize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pos_class</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.reader.binarize" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Binarizes a categorical array-like collection of labels towards the positive class <cite>pos_class</cite>. E.g.,:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">binarize</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">array</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">])</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>y</strong> array-like of labels</p></li>
<li><p><strong>pos_class</strong> integer, the positive class</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a binary np.ndarray, in which values 1 corresponds to positions in whcih <cite>y</cite> had <cite>pos_class</cite> labels, and
0 otherwise</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.reader.from_csv">
@ -376,10 +910,13 @@ where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
File format &lt;label&gt;,&lt;feat1&gt;,&lt;feat2&gt;,…,&lt;featn&gt;</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>path</strong> path to the csv file</p>
<dd class="field-odd"><ul class="simple">
<li><p><strong>path</strong> path to the csv file</p></li>
<li><p><strong>encoding</strong> the text encoding used to open the file</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a ndarray for the labels and a ndarray (float) for the covariates</p>
<dd class="field-even"><p>a np.ndarray for the labels and a ndarray (float) for the covariates</p>
</dd>
</dl>
</dd></dl>
@ -394,7 +931,7 @@ File format &lt;-1 or 0 or 1&gt;[s col(int):val(float)]</p>
<dd class="field-odd"><p><strong>path</strong> path to the labelled collection</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a csr_matrix containing the instances (rows), and a ndarray containing the labels</p>
<dd class="field-even"><p>a <cite>csr_matrix</cite> containing the instances (rows), and a ndarray containing the labels</p>
</dd>
</dl>
</dd></dl>
@ -406,7 +943,11 @@ File format &lt;-1 or 0 or 1&gt;[s col(int):val(float)]</p>
File fomart &lt;0 or 1&gt; &lt;document&gt;</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>path</strong> path to the labelled collection</p>
<dd class="field-odd"><ul class="simple">
<li><p><strong>path</strong> path to the labelled collection</p></li>
<li><p><strong>encoding</strong> the text encoding used to open the file</p></li>
<li><p><strong>verbose</strong> if &gt;0 (default) shows some progress information in standard output</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a list of sentences, and a list of labels</p>
@ -418,9 +959,19 @@ File fomart &lt;0 or 1&gt; &lt;document&gt;</p>
<dt class="sig sig-object py" id="quapy.data.reader.reindex_labels">
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">reindex_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.reader.reindex_labels" title="Permalink to this definition"></a></dt>
<dd><p>Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
E.g., y=[B, B, A, C] -&gt; [1,1,0,2], [A,B,C]
:param y: the list or array of original labels
:return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.</p>
E.g.:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">reindex_labels</span><span class="p">([</span><span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;A&#39;</span><span class="p">,</span> <span class="s1">&#39;C&#39;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">]),</span> <span class="n">array</span><span class="p">([</span><span class="s1">&#39;A&#39;</span><span class="p">,</span> <span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;C&#39;</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="s1">&#39;&lt;U1&#39;</span><span class="p">))</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>y</strong> the list or array of original labels</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.</p>
</dd>
</dl>
</dd></dl>
</section>

View File

@ -515,14 +515,20 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
<dl class="py function">
<dt class="sig sig-object py" id="quapy.evaluation.artificial_prevalence_prediction">
<span class="sig-prename descclassname"><span class="pre">quapy.evaluation.</span></span><span class="sig-name descname"><span class="pre">artificial_prevalence_prediction</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="quapy.method.html#quapy.method.base.BaseQuantifier" title="quapy.method.base.BaseQuantifier"><span class="pre">quapy.method.base.BaseQuantifier</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="quapy.data.html#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevpoints</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">210</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_repetitions</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eval_budget</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">42</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.evaluation.artificial_prevalence_prediction" title="Permalink to this definition"></a></dt>
<dd><p>Performs the predictions for all samples generated according to the artificial sampling protocol.
:param model: the model in charge of generating the class prevalence estimations
:param test: the test set on which to perform arificial sampling
:param sample_size: the size of the samples
:param n_prevpoints: the number of different prevalences to sample (or set to None if eval_budget is specified)
:param n_repetitions: the number of repetitions for each prevalence
:param eval_budget: if specified, sets a ceil on the number of evaluations to perform. For example, if there are 3
classes, n_repetitions=1 and eval_budget=20, then n_prevpoints will be set to 5, since this will generate 15
<dd><p>Performs the predictions for all samples generated according to the artificial sampling protocol.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>model</strong> the model in charge of generating the class prevalence estimations</p></li>
<li><p><strong>test</strong> the test set on which to perform arificial sampling</p></li>
<li><p><strong>sample_size</strong> the size of the samples</p></li>
<li><p><strong>n_prevpoints</strong> the number of different prevalences to sample (or set to None if eval_budget is specified)</p></li>
<li><p><strong>n_repetitions</strong> the number of repetitions for each prevalence</p></li>
<li><p><strong>eval_budget</strong> if specified, sets a ceil on the number of evaluations to perform. For example, if there are 3</p></li>
</ul>
</dd>
</dl>
<p>classes, n_repetitions=1 and eval_budget=20, then n_prevpoints will be set to 5, since this will generate 15
different prevalences ([0, 0, 1], [0, 0.25, 0.75], [0, 0.5, 0.5] … [1, 0, 0]) and since setting it n_prevpoints
to 6 would produce more than 20 evaluations.
:param n_jobs: number of jobs to be run in parallel
@ -601,7 +607,31 @@ contains the the prevalence estimations</p>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.functional.artificial_prevalence_sampling">
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">artificial_prevalence_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dimensions</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeat</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_constrained_dim</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.artificial_prevalence_sampling" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dd><p>Generates vectors of prevalence values artificially drawn from an exhaustive grid of prevalence values. The
number of prevalence values explored for each dimension depends on <cite>n_prevalences</cite>, so that, if, for example,
<cite>n_prevalences=11</cite> then the prevalence values of the grid are taken from [0, 0.1, 0.2, …, 0.9, 1]. Only
valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
valid vector of prevalence values, <cite>repeat</cite> copies are returned. The vector of prevalence values can be
implicit (by setting <cite>return_constrained_dim=False</cite>), meaning that the last dimension (which is constrained
to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to 1).</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dimensions</strong> the number of classes</p></li>
<li><p><strong>n_prevalences</strong> the number of equidistant prevalence points to extract from the [0,1] interval for the grid
(default is 21)</p></li>
<li><p><strong>repeat</strong> number of copies for each valid prevalence vector (default is 1)</p></li>
<li><p><strong>return_constrained_dim</strong> set to True to return all dimensions, or to False (default) for ommitting the
constrained dimension</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an ndarray of shape <cite>(n, dimensions)</cite> if <cite>return_constrained_dim=True</cite> or of shape <cite>(n, dimensions-1)</cite>
if <cite>return_constrained_dim=False</cite>, where <cite>n</cite> is the number of valid combinations found in the grid multiplied
by <cite>repeat</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.functional.get_nprevpoints_approximation">
@ -634,8 +664,21 @@ number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0
<dl class="py function">
<dt class="sig sig-object py" id="quapy.functional.prevalence_from_labels">
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_from_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes_</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_from_labels" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_from_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_from_labels" title="Permalink to this definition"></a></dt>
<dd><p>Computed the prevalence values from a vector of labels.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>labels</strong> array-like of shape <cite>(n_instances)</cite> with the label for each instance</p></li>
<li><p><strong>classes</strong> the class labels. This is needed in order to correctly compute the prevalence vector even when
some classes have no examples.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an ndarray of shape <cite>(len(classes))</cite> with the class prevalence values</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.functional.prevalence_from_probabilities">
@ -645,13 +688,21 @@ number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0
<dl class="py function">
<dt class="sig sig-object py" id="quapy.functional.prevalence_linspace">
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_linspace</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeat</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">smooth_limits_epsilon</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.01</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_linspace" title="Permalink to this definition"></a></dt>
<dd><p>Produces a uniformly separated values of prevalence. By default, produces an array 21 prevalences, with step 0.05
and with the limits smoothed, i.e.:
[0.01, 0.05, 0.10, 0.15, …, 0.90, 0.95, 0.99]
:param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
:param repeat: number of times each prevalence is to be repeated (defaults to 1)
:param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
:return: an array of uniformly separated prevalence values</p>
<dd><p>Produces a uniformly separated values of prevalence. By default, produces an array of 21 prevalence values, with
step 0.05 and with the limits smoothed, i.e.:
[0.01, 0.05, 0.10, 0.15, …, 0.90, 0.95, 0.99]</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>n_prevalences</strong> the number of prevalence values to sample from the [0,1] interval (default 21)</p></li>
<li><p><strong>repeat</strong> number of times each prevalence is to be repeated (defaults to 1)</p></li>
<li><p><strong>smooth_limits_epsilon</strong> the quantity to add and subtract to the limits 0 and 1</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>an array of uniformly separated prevalence values</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,3 @@
from abc import abstractmethod
from typing import List, Union
import numpy as np
from scipy.sparse import issparse
from scipy.sparse import vstack
@ -9,18 +6,19 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from quapy.functional import artificial_prevalence_sampling, strprev
class LabelledCollection:
'''
A LabelledCollection is a set of objects each with a label associated to it.
'''
"""
A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling
routines.
:param instances: array-like (np.ndarray, list, or csr_matrix are supported)
:param labels: array-like with the same length of instances
:param classes_: optional, list of classes from which labels are taken. If not specified, the classes are inferred
from the labels. The classes must be indicated in cases in which some of the labels might have no examples
(i.e., a prevalence of 0)
"""
def __init__(self, instances, labels, classes_=None):
"""
:param instances: list of objects
:param labels: list of labels, same length of instances
:param classes_: optional, list of classes from which labels are taken. When used, must contain the set of values used in labels.
"""
if issparse(instances):
self.instances = instances
elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):
@ -42,28 +40,81 @@ class LabelledCollection:
@classmethod
def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):
"""
Loads a labelled set of data and convert it into a :class:`LabelledCollection` instance. The function in charge
of reading the instances must be specified. This function can be a custom one, or any of the reading functions
defined in :mod:`quapy.data.reader` module.
:param path: string, the path to the file containing the labelled instances
:param loader_func: a custom function that implements the data loader and returns a tuple with instances and
labels
:param classes: array-like, the classes according to which the instances are labelled
:param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances, i.e.,
these arguments are used to call `loader_func(path, **loader_kwargs)`
:return: a :class:`LabelledCollection` object
"""
return LabelledCollection(*loader_func(path, **loader_kwargs), classes)
def __len__(self):
"""
Returns the length of this collection (number of labelled instances)
:return: integer
"""
return self.instances.shape[0]
def prevalence(self):
"""
Returns the prevalence, or relative frequency, of the classes of interest.
:return: a np.ndarray of shape `(n_classes)` with the relative frequencies of each class, in the same order
as listed by `self.classes_`
"""
return self.counts() / len(self)
def counts(self):
"""
Returns the number of instances for each of the classes of interest.
:return: a np.ndarray of shape `(n_classes)` with the number of instances of each class, in the same order
as listed by `self.classes_`
"""
return np.asarray([len(self.index[class_]) for class_ in self.classes_])
@property
def n_classes(self):
"""
The number of classes
:return: integer
"""
return len(self.classes_)
@property
def binary(self):
"""
Returns True if the number of classes is 2
:return: boolean
"""
return self.n_classes == 2
def sampling_index(self, size, *prevs, shuffle=True):
"""
Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
prevalence values are not specified, then returns the index of a uniform sampling.
For each class, the sampling is drawn without replacement if the requested prevalence is larger than
the actual prevalence of the class, or with replacement otherwise.
:param size: integer, the requested size
:param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since
it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
`self.classes_` can be specified, while the other class takes prevalence value `1-p`
:param shuffle: if set to True (default), shuffles the index before returning it
:return: a np.ndarray of shape `(size)` with the indexes
"""
if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling
return np.random.choice(len(self), size, replace=False)
return self.uniform_sampling_index(size)
if len(prevs) == self.n_classes - 1:
prevs = prevs + (1 - sum(prevs),)
assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
@ -93,47 +144,142 @@ class LabelledCollection:
return indexes_sample
def uniform_sampling_index(self, size):
"""
Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
without replacement if the requested size is greater than the number of instances, or with replacement
otherwise.
:param size: integer, the size of the uniform sample
:return: a np.ndarray of shape `(size)` with the indexes
"""
return np.random.choice(len(self), size, replace=False)
def uniform_sampling(self, size):
unif_index = self.uniform_sampling_index(size)
return self.sampling_from_index(unif_index)
def sampling(self, size, *prevs, shuffle=True):
"""
Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence
values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
the actual prevalence of the class, or with replacement otherwise.
:param size: integer, the requested size
:param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since
it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
`self.classes_` can be specified, while the other class takes prevalence value `1-p`
:param shuffle: if set to True (default), shuffles the index before returning it
:return: an instance of :class:`LabelledCollection` with length == `size` and prevalence close to `prevs` (or
prevalence == `prevs` if the exact prevalence values can be met as proportions of instances)
"""
prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
return self.sampling_from_index(prev_index)
def uniform_sampling(self, size):
"""
Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
without replacement if the requested size is greater than the number of instances, or with replacement
otherwise.
:param size: integer, the requested size
:return: an instance of :class:`LabelledCollection` with length == `size`
"""
unif_index = self.uniform_sampling_index(size)
return self.sampling_from_index(unif_index)
def sampling_from_index(self, index):
"""
Returns an instance of :class:`LabelledCollection` whose elements are sampled from this collection using the
index.
:param index: np.ndarray
:return: an instance of :class:`LabelledCollection`
"""
documents = self.instances[index]
labels = self.labels[index]
return LabelledCollection(documents, labels, classes_=self.classes_)
def split_stratified(self, train_prop=0.6, random_state=None):
# with temp_seed(42):
"""
Returns two instances of :class:`LabelledCollection` split with stratification from this collection, at desired
proportion.
:param train_prop: the proportion of elements to include in the left-most returned collection (typically used
as the training collection). The rest of elements are included in the right-most returned collection
(typically used as a test collection).
:param random_state: if specified, guarantees reproducibility of the split.
:return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the
second one with `1-train_prop` elements
"""
tr_docs, te_docs, tr_labels, te_labels = \
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
random_state=random_state)
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
"""
A generator of samples that implements the artificial prevalence protocol (APP). The APP consists of exploring
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, ..., 1]), and generating all valid combinations of
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
[1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
combination of prevalence values is indicated by `repeats`
:param sample_size: the number of instances in each sample
:param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
:param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
:return: yield samples generated at artificially controlled prevalence values
"""
dimensions = self.n_classes
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
yield self.sampling(sample_size, *prevs)
def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
"""
A generator of sample indexes implementing the artificial prevalence protocol (APP).
The APP consists of exploring
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, ..., 1]), and generating all valid combinations of
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
[1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of sample indexes for each valid
combination of prevalence values is indicated by `repeats`
:param sample_size: the number of instances in each sample (i.e., length of each index)
:param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
:param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
:return: yield the indexes that generate the samples according to APP
"""
dimensions = self.n_classes
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
yield self.sampling_index(sample_size, *prevs)
def natural_sampling_generator(self, sample_size, repeats=100):
"""
A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
:param sample_size: integer, the number of instances in each sample
:param repeats: the number of samples to generate
:return: yield instances of :class:`LabelledCollection`
"""
for _ in range(repeats):
yield self.uniform_sampling(sample_size)
def natural_sampling_index_generator(self, sample_size, repeats=100):
"""
A generator of sample indexes according to the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
:param sample_size: integer, the number of instances in each sample (i.e., the length of each index)
:param repeats: the number of indexes to generate
:return: yield `repeats` instances of np.ndarray with shape `(sample_size,)`
"""
for _ in range(repeats):
yield self.uniform_sampling_index(sample_size)
def __add__(self, other):
"""
Returns a new :class:`LabelledCollection` as the union of this collection with another collection
:param other: another :class:`LabelledCollection`
:return: a :class:`LabelledCollection` representing the union of both collections
"""
if other is None:
return self
elif issparse(self.instances) and issparse(other.instances):
@ -149,9 +295,29 @@ class LabelledCollection:
@property
def Xy(self):
"""
Gets the instances and labels. This is useful when working with `sklearn` estimators, e.g.:
>>> svm = LinearSVC().fit(*my_collection.Xy)
:return: a tuple `(instances, labels)` from this collection
"""
return self.instances, self.labels
def stats(self, show=True):
"""
Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,:
>>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
>>> data.training.stats()
>>> #instances=3821, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]
:param show: if set to True (default), prints the stats in standard output
:return: a dictionary containing some stats of this collection. Keys include `#instances` (the number of
instances), `type` (the type representing the instances), `#features` (the number of features, if the
instances are in array-like format), `#classes` (the classes of the collection), `prevs` (the prevalence
values for each class)
"""
ninstances = len(self)
instance_type = type(self.instances[0])
if instance_type == list:
@ -171,6 +337,14 @@ class LabelledCollection:
return stats_
def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
"""
Generator of stratified folds to be used in k-fold cross validation.
:param nfolds: integer (default 5), the number of folds to generate
:param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run
:param random_state: integer (default 0), guarantees that the folds generated are reproducible
:return: yields `nfolds * nrepeats` folds for k-fold cross validation
"""
kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
for train_index, test_index in kf.split(*self.Xy):
train = self.sampling_from_index(train_index)
@ -178,8 +352,15 @@ class LabelledCollection:
yield train, test
class Dataset:
"""
Abstraction of training and test :class:`LabelledCollection` objects.
:param training: a :class:`LabelledCollection` instance
:param test: a :class:`LabelledCollection` instance
:param vocabulary: if indicated, is a dictionary of the terms used in this textual dataset
:param name: a string representing the name of the dataset
"""
def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections'
@ -190,45 +371,118 @@ class Dataset:
@classmethod
def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
"""
Generates a :class:`Dataset` from a stratified split of a :class:`LabelledCollection` instance.
See :meth:`LabelledCollection.split_stratified`
:param collection: :class:`LabelledCollection`
:param train_size: the proportion of training documents (the rest conforms the test split)
:return: an instance of :class:`Dataset`
"""
return Dataset(*collection.split_stratified(train_prop=train_size))
@property
def classes_(self):
"""
The classes according to which the training collection is labelled
:return: The classes according to which the training collection is labelled
"""
return self.training.classes_
@property
def n_classes(self):
"""
The number of classes according to which the training collection is labelled
:return: integer
"""
return self.training.n_classes
@property
def binary(self):
"""
Returns True if the training collection is labelled according to two classes
:return: boolean
"""
return self.training.binary
@classmethod
def load(cls, train_path, test_path, loader_func: callable):
training = LabelledCollection.load(train_path, loader_func)
test = LabelledCollection.load(test_path, loader_func)
def load(cls, train_path, test_path, loader_func: callable, classes=None, **loader_kwargs):
"""
Loads a training and a test labelled set of data and convert it into a :class:`Dataset` instance.
The function in charge of reading the instances must be specified. This function can be a custom one, or any of
the reading functions defined in :mod:`quapy.data.reader` module.
:param train_path: string, the path to the file containing the training instances
:param test_path: string, the path to the file containing the test instances
:param loader_func: a custom function that implements the data loader and returns a tuple with instances and
labels
:param classes: array-like, the classes according to which the instances are labelled
:param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances.
See :meth:`LabelledCollection.load` for further details.
:return: a :class:`Dataset` object
"""
training = LabelledCollection.load(train_path, loader_func, classes, **loader_kwargs)
test = LabelledCollection.load(test_path, loader_func, classes, **loader_kwargs)
return Dataset(training, test)
@property
def vocabulary_size(self):
"""
If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary
:return: integer
"""
return len(self.vocabulary)
def stats(self):
def stats(self, show):
"""
Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:
>>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
>>> data.stats()
>>> Dataset=kindle #tr-instances=3821, #te-instances=21591, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]
:param show: if set to True (default), prints the stats in standard output
:return: a dictionary containing some stats of this collection for the training and test collections. The keys
are `train` and `test`, and point to dedicated dictionaries of stats, for each collection, with keys
`#instances` (the number of instances), `type` (the type representing the instances),
`#features` (the number of features, if the instances are in array-like format), `#classes` (the classes of
the collection), `prevs` (the prevalence values for each class)
"""
tr_stats = self.training.stats(show=False)
te_stats = self.test.stats(show=False)
print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
if show:
print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
return {'train': tr_stats, 'test': te_stats}
@classmethod
def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):
"""
Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around
:meth:`LabelledCollection.kFCV` that returns :class:`Dataset` instances made of training and test folds.
:param nfolds: integer (default 5), the number of folds to generate
:param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run
:param random_state: integer (default 0), guarantees that the folds generated are reproducible
:return: yields `nfolds * nrepeats` folds for k-fold cross validation as instances of :class:`Dataset`
"""
for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)):
yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')
def isbinary(data):
"""
Returns True if `data` is either a binary :class:`Dataset` or a binary :class:`LabelledCollection`
:param data: a :class:`Dataset` or a :class:`LabelledCollection` object
:return: True if labelled according to two classes
"""
if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
return data.binary
return False

View File

@ -5,9 +5,6 @@ warnings.warn = warn
import os
import zipfile
from os.path import join
from urllib.error import HTTPError
from sklearn.model_selection import StratifiedKFold
import pandas as pd
from quapy.data.base import Dataset, LabelledCollection
@ -49,18 +46,20 @@ UCI_DATASETS = ['acute.a', 'acute.b',
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
"""
Load a Reviews dataset as a Dataset instance, as used in:
Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
Loads a Reviews dataset as a Dataset instance, as used in
`Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_.
The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`
:param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
:param tfidf: set to True to transform the raw documents into tfidf weighted matrices
:param min_df: minimun number of documents that should contain a term in order for the term to be
kept (ignored if tfidf==False)
kept (ignored if tfidf==False)
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
~/quay_data/ directory)
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations
:return: a Dataset instance
faster subsequent invokations
:return: a :class:`quapy.data.base.Dataset` instance
"""
assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
@ -93,22 +92,25 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
"""
Load a Twitter dataset as a Dataset instance, as used in:
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining6(19), 122 (2016)
The datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:
`Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining6(19), 122 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_
Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
The list of valid dataset names corresponding to training sets can be accessed in
`quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in
`quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`
:param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
:param for_model_selection: if True, then returns the train split as the training set and the devel split
as the test set; if False, then returns the train+devel split as the training set and the test set as the
test set
as the test set; if False, then returns the train+devel split as the training set and the test set as the
test set
:param min_df: minimun number of documents that should contain a term in order for the term to be kept
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
~/quay_data/ directory)
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations
:return: a Dataset instance
faster subsequent invokations
:return: a :class:`quapy.data.base.Dataset` instance
"""
assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
@ -163,11 +165,58 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
"""
Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
and
`Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
information on how to use these collections), and so a train-test split is generated at desired proportion.
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
:return: a :class:`quapy.data.base.Dataset` instance
"""
data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
"""
Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
and
`Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
>>> import quapy as qp
>>> collection = qp.datasets.fetch_UCILabelledCollection("yeast")
>>> for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
>>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
:return: a :class:`quapy.data.base.Dataset` instance
"""
assert dataset_name in UCI_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
@ -302,7 +351,7 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
[df_replace(df, col) for col in range(1, 6)]
[_df_replace(df, col) for col in range(1, 6)]
X = df.loc[:, 0:5].values
if dataset_name == 'acute.a':
y = binarize(df[6], pos_class='yes')
@ -482,5 +531,5 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
return data
def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)

View File

@ -12,14 +12,18 @@ from .base import LabelledCollection
def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
"""
Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors
:param dataset: a Dataset where the instances are lists of str
:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary
:param sublinear_tf: whether or not to apply the log scalling to the tf counters
:param inplace: whether or not to apply the transformation inplace, or to a new copy
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer)
:return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)
where the instances are stored in a csr_matrix of real-valued tfidf scores
Transforms a :class:`quapy.data.base.Dataset` of textual instances into a :class:`quapy.data.base.Dataset` of
tfidf weighted sparse vectors
:param dataset: a :class:`quapy.data.base.Dataset` where the instances of training and test collections are
lists of str
:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)
:param sublinear_tf: whether or not to apply the log scalling to the tf counters (default True)
:param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
:param kwargs: the rest of parameters of the transformation (as for sklearn's
`TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_)
:return: a new :class:`quapy.data.base.Dataset` in `csr_matrix` format (if inplace=False) or a reference to the
current Dataset (if inplace=True) where the instances are stored in a `csr_matrix` of real-valued tfidf scores
"""
__check_type(dataset.training.instances, np.ndarray, str)
__check_type(dataset.test.instances, np.ndarray, str)
@ -41,13 +45,17 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
"""
Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least
_min_df_ instances
:param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix)
:param min_df: minimum number of instances below which the columns are removed
:param inplace: whether or not to apply the transformation inplace, or to a new copy
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
where the dimensions corresponding to infrequent instances have been removed
Reduces the dimensionality of the instances, represented as a `csr_matrix` (or any subtype of
`scipy.sparse.spmatrix`), of training and test documents by removing the columns of words which are not present
in at least `min_df` instances in the training set
:param dataset: a :class:`quapy.data.base.Dataset` in which instances are represented in sparse format (any
subtype of scipy.sparse.spmatrix)
:param min_df: integer, minimum number of instances below which the columns are removed
:param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
:return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current
:class:`quapy.data.base.Dataset` (inplace=True) where the dimensions corresponding to infrequent terms
in the training set have been removed
"""
__check_type(dataset.training.instances, spmatrix)
__check_type(dataset.test.instances, spmatrix)
@ -71,7 +79,17 @@ def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
return Dataset(training, test)
def standardize(dataset: Dataset, inplace=True):
def standardize(dataset: Dataset, inplace=False):
"""
Standardizes the real-valued columns of a :class:`quapy.data.base.Dataset`.
Standardization, aka z-scoring, of a variable `X` comes down to subtracting the average and normalizing by the
standard deviation.
:param dataset: a :class:`quapy.data.base.Dataset` object
:param inplace: set to True if the transformation is to be applied inplace, or to False (default) if a new
:class:`quapy.data.base.Dataset` is to be returned
:return:
"""
s = StandardScaler(copy=not inplace)
training = s.fit_transform(dataset.training.instances)
test = s.transform(dataset.test.instances)
@ -83,14 +101,18 @@ def standardize(dataset: Dataset, inplace=True):
def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
"""
Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index.
Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK
:param dataset: a Dataset where the instances are lists of str
:param min_df: minimum number of instances below which the term is replaced by a UNK index
:param inplace: whether or not to apply the transformation inplace, or to a new copy
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer)
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
consisting of lists of integer values representing indices.
Indexes the tokens of a textual :class:`quapy.data.base.Dataset` of string documents.
To index a document means to replace each different token by a unique numerical index.
Rare words (i.e., words occurring less than `min_df` times) are replaced by a special token `UNK`
:param dataset: a :class:`quapy.data.base.Dataset` object where the instances of training and test documents
are lists of str
:param min_df: minimum number of occurrences below which the term is replaced by a `UNK` index
:param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
:param kwargs: the rest of parameters of the transformation (as for sklearn's
`CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>_`)
:return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current
:class:`quapy.data.base.Dataset` (inplace=True) consisting of lists of integer values representing indices.
"""
__check_type(dataset.training.instances, np.ndarray, str)
__check_type(dataset.test.instances, np.ndarray, str)
@ -120,17 +142,23 @@ def __check_type(container, container_type=None, element_type=None):
class IndexTransformer:
"""
This class implements a sklearn's-style transformer that indexes text as numerical ids for the tokens it
contains, and that would be generated by sklearn's
`CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
:param kwargs: keyworded arguments from `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
"""
def __init__(self, **kwargs):
"""
:param kwargs: keyworded arguments from _sklearn.feature_extraction.text.CountVectorizer_
"""
self.vect = CountVectorizer(**kwargs)
self.unk = -1 # a valid index is assigned after fit
self.pad = -2 # a valid index is assigned after fit
def fit(self, X):
"""
Fits the transformer, i.e., decides on the vocabulary, given a list of strings.
:param X: a list of strings
:return: self
"""
@ -142,22 +170,52 @@ class IndexTransformer:
return self
def transform(self, X, n_jobs=-1):
"""
Transforms the strings in `X` as lists of numerical ids
:param X: a list of strings
:param n_jobs: the number of parallel workers to carry out this task
:return: a `np.ndarray` of numerical ids
"""
# given the number of tasks and the number of jobs, generates the slices for the parallel processes
assert self.unk != -1, 'transform called before fit'
indexed = map_parallel(func=self.index, args=X, n_jobs=n_jobs)
indexed = map_parallel(func=self._index, args=X, n_jobs=n_jobs)
return np.asarray(indexed)
def index(self, documents):
def _index(self, documents):
vocab = self.vocabulary_.copy()
return [[vocab.prevalence(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
def fit_transform(self, X, n_jobs=-1):
"""
Fits the transform on `X` and transforms it.
:param X: a list of strings
:param n_jobs: the number of parallel workers to carry out this task
:return: a `np.ndarray` of numerical ids
"""
return self.fit(X).transform(X, n_jobs=n_jobs)
def vocabulary_size(self):
"""
Gets the length of the vocabulary according to which the document tokens have been indexed
:return: integer
"""
return len(self.vocabulary_)
def add_word(self, word, id=None, nogaps=True):
"""
Adds a new token (regardless of whether it has been found in the text or not), with dedicated id.
Useful to define special tokens for codifying unknown words, or padding tokens.
:param word: string, surface form of the token
:param id: integer, numerical value to assign to the token (leave as None for indicating the next valid id,
default)
:param nogaps: if set to True (default) asserts that the id indicated leads to no numerical gaps with
precedent ids stored so far
:return: integer, the numerical id for the new token
"""
if word in self.vocabulary_:
raise ValueError(f'word {word} already in dictionary')
if id is None:

View File

@ -7,7 +7,10 @@ def from_text(path, encoding='utf-8', verbose=1, class2int=True):
"""
Reads a labelled colletion of documents.
File fomart <0 or 1>\t<document>\n
:param path: path to the labelled collection
:param encoding: the text encoding used to open the file
:param verbose: if >0 (default) shows some progress information in standard output
:return: a list of sentences, and a list of labels
"""
all_sentences, all_labels = [], []
@ -35,8 +38,9 @@ def from_sparse(path):
"""
Reads a labelled collection of real-valued instances expressed in sparse format
File format <-1 or 0 or 1>[\s col(int):val(float)]\n
:param path: path to the labelled collection
:return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
:return: a `csr_matrix` containing the instances (rows), and a ndarray containing the labels
"""
def split_col_val(col_val):
@ -68,8 +72,10 @@ def from_csv(path, encoding='utf-8'):
"""
Reads a csv file in which columns are separated by ','.
File format <label>,<feat1>,<feat2>,...,<featn>\n
:param path: path to the csv file
:return: a ndarray for the labels and a ndarray (float) for the covariates
:param encoding: the text encoding used to open the file
:return: a np.ndarray for the labels and a ndarray (float) for the covariates
"""
X, y = [], []
@ -85,11 +91,16 @@ def from_csv(path, encoding='utf-8'):
def reindex_labels(y):
"""
Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
E.g., y=['B', 'B', 'A', 'C'] -> [1,1,0,2], ['A','B','C']
E.g.:
>>> reindex_labels(['B', 'B', 'A', 'C'])
>>> (array([1, 1, 0, 2]), array(['A', 'B', 'C'], dtype='<U1'))
:param y: the list or array of original labels
:return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.
"""
classnames = sorted(np.unique(y))
y = np.asarray(y)
classnames = np.asarray(sorted(np.unique(y)))
label2index = {label: index for index, label in enumerate(classnames)}
indexed = np.empty(y.shape, dtype=np.int)
for label in classnames:
@ -98,6 +109,17 @@ def reindex_labels(y):
def binarize(y, pos_class):
"""
Binarizes a categorical array-like collection of labels towards the positive class `pos_class`. E.g.,:
>>> binarize([1, 2, 3, 1, 1, 0], pos_class=2)
>>> array([0, 1, 0, 0, 0, 0])
:param y: array-like of labels
:param pos_class: integer, the positive class
:return: a binary np.ndarray, in which values 1 corresponds to positions in whcih `y` had `pos_class` labels, and
0 otherwise
"""
y = np.asarray(y)
ybin = np.zeros(y.shape, dtype=np.int)
ybin[y == pos_class] = 1