updating the documentation

2021-12-06 18:25:47 +01:00 · 2021-12-06 18:25:47 +01:00 · 2bd47f0841
parent 1f591ec105
commit 2bd47f0841
9 changed files with 1197 additions and 218 deletions
--- a/docs/build/html/genindex.html
+++ b/docs/build/html/genindex.html
@ -255,12 +255,8 @@
        <li><a href="quapy.method.html#quapy.method.neural.QuaNetModule.device">(quapy.method.neural.QuaNetModule property)</a>
 </li>
      </ul></li>
      <li><a href="quapy.data.html#quapy.data.datasets.df_replace">df_replace() (in module quapy.data.datasets)</a>
 </li>
      <li><a href="quapy.classification.html#quapy.classification.neural.TextClassifierNet.dimensions">dimensions() (quapy.classification.neural.TextClassifierNet method)</a>
 </li>
  </ul></td>
  <td style="width: 33%; vertical-align: top;"><ul>
      <li><a href="quapy.classification.html#quapy.classification.neural.CNNnet.document_embedding">document_embedding() (quapy.classification.neural.CNNnet method)</a>
      <ul>
@ -269,6 +265,8 @@
        <li><a href="quapy.classification.html#quapy.classification.neural.TextClassifierNet.document_embedding">(quapy.classification.neural.TextClassifierNet method)</a>
 </li>
      </ul></li>
  </ul></td>
  <td style="width: 33%; vertical-align: top;"><ul>
      <li><a href="quapy.html#quapy.util.download_file">download_file() (in module quapy.util)</a>
 </li>
      <li><a href="quapy.html#quapy.util.download_file_if_not_exists">download_file_if_not_exists() (in module quapy.util)</a>
@ -462,19 +460,15 @@
 <table style="width: 100%" class="indextable genindextable"><tr>
  <td style="width: 33%; vertical-align: top;"><ul>
      <li><a href="quapy.data.html#quapy.data.preprocessing.index">index() (in module quapy.data.preprocessing)</a>
      <ul>
        <li><a href="quapy.data.html#quapy.data.preprocessing.IndexTransformer.index">(quapy.data.preprocessing.IndexTransformer method)</a>
 </li>
      </ul></li>
      <li><a href="quapy.data.html#quapy.data.preprocessing.IndexTransformer">IndexTransformer (class in quapy.data.preprocessing)</a>
 </li>
      <li><a href="quapy.method.html#quapy.method.neural.QuaNetModule.init_hidden">init_hidden() (quapy.method.neural.QuaNetModule method)</a>
 </li>
  </ul></td>
  <td style="width: 33%; vertical-align: top;"><ul>
      <li><a href="quapy.method.html#quapy.method.base.isaggregative">isaggregative() (in module quapy.method.base)</a>
 </li>
  </ul></td>
  <td style="width: 33%; vertical-align: top;"><ul>
      <li><a href="quapy.html#quapy.isbinary">isbinary() (in module quapy)</a>
      <ul>
--- a/docs/build/html/objects.inv
+++ b/docs/build/html/objects.inv
--- a/docs/build/html/quapy.data.html
+++ b/docs/build/html/quapy.data.html
@ -63,45 +63,144 @@
 <dt class="sig sig-object py" id="quapy.data.base.Dataset">
 <em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">Dataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">training</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">vocabulary</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset" title="Permalink to this definition">¶</a></dt>
 <dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
 <p>Abstraction of training and test <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> objects.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>training</strong> – a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance</p></li>
 <li><p><strong>test</strong> – a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance</p></li>
 <li><p><strong>vocabulary</strong> – if indicated, is a dictionary of the terms used in this textual dataset</p></li>
 <li><p><strong>name</strong> – a string representing the name of the dataset</p></li>
 </ul>
 </dd>
 </dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.Dataset.SplitStratified">
 <em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">SplitStratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">collection</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">train_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.SplitStratified" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Generates a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> from a stratified split of a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance.
 See <a class="reference internal" href="#quapy.data.base.LabelledCollection.split_stratified" title="quapy.data.base.LabelledCollection.split_stratified"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.split_stratified()</span></code></a></p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>collection</strong> – <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p></li>
 <li><p><strong>train_size</strong> – the proportion of training documents (the rest forms the test split)</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a></p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py property">
 <dt class="sig sig-object py" id="quapy.data.base.Dataset.binary">
 <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.Dataset.binary" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Returns True if the training collection is labelled according to two classes</p>
 <dl class="field-list simple">
 <dt class="field-odd">Returns</dt>
 <dd class="field-odd"><p>boolean</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py property">
 <dt class="sig sig-object py" id="quapy.data.base.Dataset.classes_">
 <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">classes_</span></span><a class="headerlink" href="#quapy.data.base.Dataset.classes_" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>The classes according to which the training collection is labelled</p>
 <dl class="field-list simple">
 <dt class="field-odd">Returns</dt>
 <dd class="field-odd"><p>The classes according to which the training collection is labelled</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.Dataset.kFCV">
 <em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.kFCV" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around
 <a class="reference internal" href="#quapy.data.base.LabelledCollection.kFCV" title="quapy.data.base.LabelledCollection.kFCV"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.kFCV()</span></code></a> that returns <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> instances made of training and test folds.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>nfolds</strong> – integer (default 5), the number of folds to generate</p></li>
 <li><p><strong>nrepeats</strong> – integer (default 1), the number of rounds of k-fold cross validation to run</p></li>
 <li><p><strong>random_state</strong> – integer (default 0), guarantees that the folds generated are reproducible</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>yields <cite>nfolds * nrepeats</cite> folds for k-fold cross validation as instances of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a></p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.Dataset.load">
-<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.load" title="Permalink to this definition">¶</a></dt>
+<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">loader_kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.load" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Loads a training and a test labelled set of data and converts them into a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> instance.
 The function in charge of reading the instances must be specified. This function can be a custom one, or any of
 the reading functions defined in <a class="reference internal" href="#module-quapy.data.reader" title="quapy.data.reader"><code class="xref py py-mod docutils literal notranslate"><span class="pre">quapy.data.reader</span></code></a> module.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>train_path</strong> – string, the path to the file containing the training instances</p></li>
 <li><p><strong>test_path</strong> – string, the path to the file containing the test instances</p></li>
 <li><p><strong>loader_func</strong> – a custom function that implements the data loader and returns a tuple with instances and
 labels</p></li>
 <li><p><strong>classes</strong> – array-like, the classes according to which the instances are labelled</p></li>
 <li><p><strong>loader_kwargs</strong> – any argument that the <cite>loader_func</cite> function needs in order to read the instances.
 See <a class="reference internal" href="#quapy.data.base.LabelledCollection.load" title="quapy.data.base.LabelledCollection.load"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.load()</span></code></a> for further details.</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py property">
 <dt class="sig sig-object py" id="quapy.data.base.Dataset.n_classes">
 <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.Dataset.n_classes" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>The number of classes according to which the training collection is labelled</p>
 <dl class="field-list simple">
 <dt class="field-odd">Returns</dt>
 <dd class="field-odd"><p>integer</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.Dataset.stats">
-<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.stats" title="Permalink to this definition">¶</a></dt>
+<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.stats" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Returns (and optionally prints) a dictionary with some stats of this dataset. E.g.:</p>
 <div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">&#39;kindle&#39;</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
 <span class="gp">&gt;&gt;&gt; </span><span class="n">data</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
 <span class="gp">&gt;&gt;&gt; </span><span class="n">Dataset</span><span class="o">=</span><span class="n">kindle</span> <span class="c1">#tr-instances=3821, #te-instances=21591, type=&lt;class &#39;scipy.sparse.csr.csr_matrix&#39;&gt;, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]</span>
 </pre></div>
 </div>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><p><strong>show</strong> – if set to True (default), prints the stats in standard output</p>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a dictionary containing some stats of this dataset for the training and test collections. The keys
 are <cite>train</cite> and <cite>test</cite>, and point to dedicated dictionaries of stats, for each collection, with keys
 <cite>#instances</cite> (the number of instances), <cite>type</cite> (the type representing the instances),
 <cite>#features</cite> (the number of features, if the instances are in array-like format), <cite>#classes</cite> (the classes of
 the collection), <cite>prevs</cite> (the prevalence values for each class)</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py property">
 <dt class="sig sig-object py" id="quapy.data.base.Dataset.vocabulary_size">
 <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">vocabulary_size</span></span><a class="headerlink" href="#quapy.data.base.Dataset.vocabulary_size" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary</p>
 <dl class="field-list simple">
 <dt class="field-odd">Returns</dt>
 <dd class="field-odd"><p>integer</p>
 </dd>
 </dl>
 </dd></dl>
 </dd></dl>
@ -109,161 +208,480 @@
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection">
 <em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">LabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">instances</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes_</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection" title="Permalink to this definition">¶</a></dt>
 <dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
-<p>A LabelledCollection is a set of objects each with a label associated to it.</p>
+<p>A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling
 routines.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>instances</strong> – array-like (np.ndarray, list, or csr_matrix are supported)</p></li>
 <li><p><strong>labels</strong> – array-like with the same length as instances</p></li>
 <li><p><strong>classes_</strong> – optional, list of classes from which labels are taken. If not specified, the classes are inferred
 from the labels. The classes must be indicated in cases in which some of the labels might have no examples
 (i.e., a prevalence of 0)</p></li>
 </ul>
 </dd>
 </dl>
 <dl class="py property">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.Xy">
 <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">Xy</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.Xy" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Gets the instances and labels. This is useful when working with <cite>sklearn</cite> estimators, e.g.:</p>
 <div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span> <span class="o">=</span> <span class="n">LinearSVC</span><span class="p">()</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="o">*</span><span class="n">my_collection</span><span class="o">.</span><span class="n">Xy</span><span class="p">)</span>
 </pre></div>
 </div>
 <dl class="field-list simple">
 <dt class="field-odd">Returns</dt>
 <dd class="field-odd"><p>a tuple <cite>(instances, labels)</cite> from this collection</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.artificial_sampling_generator">
 <span class="sig-name descname"><span class="pre">artificial_sampling_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">101</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.artificial_sampling_generator" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>A generator of samples that implements the artificial prevalence protocol (APP). The APP consists of exploring
 a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, …, 1]), and generating all valid combinations of
 prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], …,
 [1, 0, 0] prevalence values of size <cite>sample_size</cite> will be yielded). The number of samples for each valid
 combination of prevalence values is indicated by <cite>repeats</cite></p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>sample_size</strong> – the number of instances in each sample</p></li>
 <li><p><strong>n_prevalences</strong> – the number of prevalence points to be taken from the [0,1] interval (including the
 limits {0,1}). E.g., if <cite>n_prevalences=11</cite>, then the prevalence points to take are [0, 0.1, 0.2, …, 1]</p></li>
 <li><p><strong>repeats</strong> – the number of samples to generate for each valid combination of prevalence values (default 1)</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>yields samples generated at artificially controlled prevalence values</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.artificial_sampling_index_generator">
 <span class="sig-name descname"><span class="pre">artificial_sampling_index_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">101</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.artificial_sampling_index_generator" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>A generator of sample indexes implementing the artificial prevalence protocol (APP).
 The APP consists of exploring
 a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, …, 1]), and generating all valid combinations of
 prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], …,
 [1, 0, 0] prevalence values of size <cite>sample_size</cite> will be yielded). The number of sample indexes for each valid
 combination of prevalence values is indicated by <cite>repeats</cite></p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>sample_size</strong> – the number of instances in each sample (i.e., length of each index)</p></li>
 <li><p><strong>n_prevalences</strong> – the number of prevalence points to be taken from the [0,1] interval (including the
 limits {0,1}). E.g., if <cite>n_prevalences=11</cite>, then the prevalence points to take are [0, 0.1, 0.2, …, 1]</p></li>
 <li><p><strong>repeats</strong> – the number of samples to generate for each valid combination of prevalence values (default 1)</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>yields the indexes that generate the samples according to APP</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py property">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.binary">
 <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.binary" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Returns True if the number of classes is 2</p>
 <dl class="field-list simple">
 <dt class="field-odd">Returns</dt>
 <dd class="field-odd"><p>boolean</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.counts">
 <span class="sig-name descname"><span class="pre">counts</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.counts" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Returns the number of instances for each of the classes of interest.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Returns</dt>
 <dd class="field-odd"><p>a np.ndarray of shape <cite>(n_classes)</cite> with the number of instances of each class, in the same order
 as listed by <cite>self.classes_</cite></p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.kFCV">
 <span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.kFCV" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Generator of stratified folds to be used in k-fold cross validation.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>nfolds</strong> – integer (default 5), the number of folds to generate</p></li>
 <li><p><strong>nrepeats</strong> – integer (default 1), the number of rounds of k-fold cross validation to run</p></li>
 <li><p><strong>random_state</strong> – integer (default 0), guarantees that the folds generated are reproducible</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>yields <cite>nfolds * nrepeats</cite> folds for k-fold cross validation</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.load">
-<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.load" title="Permalink to this definition">¶</a></dt>
+<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">loader_kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.load" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Loads a labelled set of data and converts it into a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance. The function in charge
 of reading the instances must be specified. This function can be a custom one, or any of the reading functions
 defined in <a class="reference internal" href="#module-quapy.data.reader" title="quapy.data.reader"><code class="xref py py-mod docutils literal notranslate"><span class="pre">quapy.data.reader</span></code></a> module.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>path</strong> – string, the path to the file containing the labelled instances</p></li>
 <li><p><strong>loader_func</strong> – a custom function that implements the data loader and returns a tuple with instances and
 labels</p></li>
 <li><p><strong>classes</strong> – array-like, the classes according to which the instances are labelled</p></li>
 <li><p><strong>loader_kwargs</strong> – any argument that the <cite>loader_func</cite> function needs in order to read the instances, i.e.,
 these arguments are used to call <cite>loader_func(path, **loader_kwargs)</cite></p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> object</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py property">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.n_classes">
 <em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.n_classes" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>The number of classes</p>
 <dl class="field-list simple">
 <dt class="field-odd">Returns</dt>
 <dd class="field-odd"><p>integer</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.natural_sampling_generator">
 <span class="sig-name descname"><span class="pre">natural_sampling_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.natural_sampling_generator" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
 samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>sample_size</strong> – integer, the number of instances in each sample</p></li>
 <li><p><strong>repeats</strong> – the number of samples to generate</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>yield instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.natural_sampling_index_generator">
 <span class="sig-name descname"><span class="pre">natural_sampling_index_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.natural_sampling_index_generator" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>A generator of sample indexes according to the natural prevalence protocol (NPP). The NPP consists of drawing
 samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>sample_size</strong> – integer, the number of instances in each sample (i.e., the length of each index)</p></li>
 <li><p><strong>repeats</strong> – the number of indexes to generate</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>yield <cite>repeats</cite> instances of np.ndarray with shape <cite>(sample_size,)</cite></p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.prevalence">
 <span class="sig-name descname"><span class="pre">prevalence</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.prevalence" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Returns the prevalence, or relative frequency, of the classes of interest.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Returns</dt>
 <dd class="field-odd"><p>a np.ndarray of shape <cite>(n_classes)</cite> with the relative frequencies of each class, in the same order
 as listed by <cite>self.classes_</cite></p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling">
 <span class="sig-name descname"><span class="pre">sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Return a random sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size and desired prevalence
 values. For each class, the sampling is drawn with replacement if the requested prevalence is larger than
 the actual prevalence of the class, or without replacement otherwise.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>size</strong> – integer, the requested size</p></li>
 <li><p><strong>prevs</strong> – the prevalence for each class; the prevalence value for the last class can be left empty since
 it is constrained. E.g., for binary collections, only the prevalence <cite>p</cite> for the first class (as listed in
 <cite>self.classes_</cite>) can be specified, while the other class takes prevalence value <cite>1-p</cite></p></li>
 <li><p><strong>shuffle</strong> – if set to True (default), shuffles the index before returning it</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> with length == <cite>size</cite> and prevalence close to <cite>prevs</cite> (or
 prevalence == <cite>prevs</cite> if the exact prevalence values can be met as proportions of instances)</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_from_index">
 <span class="sig-name descname"><span class="pre">sampling_from_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">index</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_from_index" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Returns an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> whose elements are sampled from this collection using the
 index.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><p><strong>index</strong> – np.ndarray</p>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_index">
 <span class="sig-name descname"><span class="pre">sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_index" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
 prevalence values are not specified, then returns the index of a uniform sampling.
 For each class, the sampling is drawn with replacement if the requested prevalence is larger than
 the actual prevalence of the class, or without replacement otherwise.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>size</strong> – integer, the requested size</p></li>
 <li><p><strong>prevs</strong> – the prevalence for each class; the prevalence value for the last class can be left empty since
 it is constrained. E.g., for binary collections, only the prevalence <cite>p</cite> for the first class (as listed in
 <cite>self.classes_</cite>) can be specified, while the other class takes prevalence value <cite>1-p</cite></p></li>
 <li><p><strong>shuffle</strong> – if set to True (default), shuffles the index before returning it</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a np.ndarray of shape <cite>(size)</cite> with the indexes</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.split_stratified">
 <span class="sig-name descname"><span class="pre">split_stratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_prop</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.split_stratified" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Returns two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> split with stratification from this collection, at desired
 proportion.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>train_prop</strong> – the proportion of elements to include in the left-most returned collection (typically used
 as the training collection). The rest of elements are included in the right-most returned collection
 (typically used as a test collection).</p></li>
 <li><p><strong>random_state</strong> – if specified, guarantees reproducibility of the split.</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>, the first one with <cite>train_prop</cite> elements, and the
 second one with <cite>1-train_prop</cite> elements</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.stats">
 <span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.stats" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Returns (and optionally prints) a dictionary with some stats of this collection. E.g.:</p>
 <div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">&#39;kindle&#39;</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
 <span class="gp">&gt;&gt;&gt; </span><span class="n">data</span><span class="o">.</span><span class="n">training</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
 <span class="gp">&gt;&gt;&gt; </span><span class="c1">#instances=3821, type=&lt;class &#39;scipy.sparse.csr.csr_matrix&#39;&gt;, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]</span>
 </pre></div>
 </div>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><p><strong>show</strong> – if set to True (default), prints the stats in standard output</p>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a dictionary containing some stats of this collection. Keys include <cite>#instances</cite> (the number of
 instances), <cite>type</cite> (the type representing the instances), <cite>#features</cite> (the number of features, if the
 instances are in array-like format), <cite>#classes</cite> (the classes of the collection), <cite>prevs</cite> (the prevalence
 values for each class)</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling">
 <span class="sig-name descname"><span class="pre">uniform_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Returns a uniform sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size. The sampling is drawn
 with replacement if the requested size is greater than the number of instances, or without replacement
 otherwise.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><p><strong>size</strong> – integer, the requested size</p>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> with length == <cite>size</cite></p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling_index">
 <span class="sig-name descname"><span class="pre">uniform_sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling_index" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
 with replacement if the requested size is greater than the number of instances, or without replacement
 otherwise.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><p><strong>size</strong> – integer, the size of the uniform sample</p>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a np.ndarray of shape <cite>(size)</cite> with the indexes</p>
 </dd>
 </dl>
 </dd></dl>
 </dd></dl>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.data.base.isbinary">
 <span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">isbinary</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.isbinary" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Returns True if <cite>data</cite> is either a binary <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> or a binary <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><p><strong>data</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> or a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> object</p>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>True if labelled according to two classes</p>
 </dd>
 </dl>
 </dd></dl>
 </section>
 <section id="module-quapy.data.datasets">
 <span id="quapy-data-datasets-module"></span><h2>quapy.data.datasets module<a class="headerlink" href="#module-quapy.data.datasets" title="Permalink to this headline">¶</a></h2>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.data.datasets.df_replace">
 <span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">df_replace</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="pre">df</span></em>, <em class="sig-param"><span class="pre">col</span></em>, <em class="sig-param"><span class="pre">repl={'no':</span> <span class="pre">0</span></em>, <em class="sig-param"><span class="pre">'yes':</span> <span class="pre">1}</span></em>, <em class="sig-param"><span class="pre">astype=&lt;class</span> <span class="pre">'float'&gt;</span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.datasets.df_replace" title="Permalink to this definition">¶</a></dt>
 <dd></dd></dl>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCIDataset">
 <span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCIDataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_split</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_UCIDataset" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Loads a UCI dataset as an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>, as used in
 <a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253516300628">Pérez-Gállego, P., Quevedo, J. R., &amp; del Coz, J. J. (2017).
 Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
 Information Fusion, 34, 87-100.</a>
 and
 <a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253517303652">Pérez-Gállego, P., Castano, A., Quevedo, J. R., &amp; del Coz, J. J. (2019).
 Dynamic ensemble selection for quantification tasks.
 Information Fusion, 45, 1-15.</a>.
 The datasets do not come with a predefined train-test split (see <a class="reference internal" href="#quapy.data.datasets.fetch_UCILabelledCollection" title="quapy.data.datasets.fetch_UCILabelledCollection"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fetch_UCILabelledCollection()</span></code></a> for further
 information on how to use these collections), and so a train-test split is generated at desired proportion.
 The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_DATASETS</cite></p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>dataset_name</strong> – a dataset name</p></li>
 <li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
 ~/quapy_data/ directory)</p></li>
 <li><p><strong>test_split</strong> – proportion of documents to be included in the test set. The rest forms the training set</p></li>
 <li><p><strong>verbose</strong> – set to True (default is False) to get information (from the UCI ML repository) about the datasets</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCILabelledCollection">
 <span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCILabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_UCILabelledCollection" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Loads a UCI collection as an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>, as used in
 <a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253516300628">Pérez-Gállego, P., Quevedo, J. R., &amp; del Coz, J. J. (2017).
 Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
 Information Fusion, 34, 87-100.</a>
 and
 <a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253517303652">Pérez-Gállego, P., Castano, A., Quevedo, J. R., &amp; del Coz, J. J. (2019).
 Dynamic ensemble selection for quantification tasks.
 Information Fusion, 45, 1-15.</a>.
 The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
 protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
 This can be reproduced by using <a class="reference internal" href="#quapy.data.base.Dataset.kFCV" title="quapy.data.base.Dataset.kFCV"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.data.base.Dataset.kFCV()</span></code></a>, e.g.:</p>
 <div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
 <span class="gp">&gt;&gt;&gt; </span><span class="n">collection</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_UCILabelledCollection</span><span class="p">(</span><span class="s2">&quot;yeast&quot;</span><span class="p">)</span>
 <span class="gp">&gt;&gt;&gt; </span><span class="k">for</span> <span class="n">data</span> <span class="ow">in</span> <span class="n">qp</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">Dataset</span><span class="o">.</span><span class="n">kFCV</span><span class="p">(</span><span class="n">collection</span><span class="p">,</span> <span class="n">nfolds</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">nrepeats</span><span class="o">=</span><span class="mi">2</span><span class="p">):</span>
 <span class="gp">&gt;&gt;&gt; </span>    <span class="o">...</span>
 </pre></div>
 </div>
 <p>The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_DATASETS</cite></p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>dataset_name</strong> – a dataset name</p></li>
 <li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
 ~/quapy_data/ directory)</p></li>
 <li><p><strong>test_split</strong> – proportion of documents to be included in the test set. The rest forms the training set</p></li>
 <li><p><strong>verbose</strong> – set to True (default is False) to get information (from the UCI ML repository) about the datasets</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a> instance</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.data.datasets.fetch_reviews">
 <span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_reviews</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tfidf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_reviews" title="Permalink to this definition">¶</a></dt>
-<dd><p>Load a Reviews dataset as a Dataset instance, as used in:
+<dd><p>Loads a Reviews dataset as a Dataset instance, as used in
-Esuli, A., Moreo, A., and Sebastiani, F. “A recurrent neural network for sentiment quantification.”
+<a class="reference external" href="https://dl.acm.org/doi/abs/10.1145/3269206.3269287">Esuli, A., Moreo, A., and Sebastiani, F. “A recurrent neural network for sentiment quantification.”
-Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
+Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.</a>.
-:param dataset_name: the name of the dataset: valid ones are ‘hp’, ‘kindle’, ‘imdb’
+The list of valid dataset names can be accessed in <cite>quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS</cite></p>
-:param tfidf: set to True to transform the raw documents into tfidf weighted matrices
+<dl class="field-list simple">
-:param min_df: minimun number of documents that should contain a term in order for the term to be
+<dt class="field-odd">Parameters</dt>
-kept (ignored if tfidf==False)
+<dd class="field-odd"><ul class="simple">
-:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+<li><p><strong>dataset_name</strong> – the name of the dataset: valid ones are ‘hp’, ‘kindle’, ‘imdb’</p></li>
-~/quay_data/ directory)
+<li><p><strong>tfidf</strong> – set to True to transform the raw documents into tfidf weighted matrices</p></li>
-:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
+<li><p><strong>min_df</strong> – minimum number of documents that should contain a term in order for the term to be
-faster subsequent invokations
+kept (ignored if tfidf==False)</p></li>
-:return: a Dataset instance</p>
+<li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
 ~/quapy_data/ directory)</p></li>
 <li><p><strong>pickle</strong> – set to True to pickle the Dataset object the first time it is generated, in order to allow for
 faster subsequent invocations</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.data.datasets.fetch_twitter">
 <span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_twitter</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">for_model_selection</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_twitter" title="Permalink to this definition">¶</a></dt>
-<dd><p>Load a Twitter dataset as a Dataset instance, as used in:
+<dd><p>Loads a Twitter dataset as a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance, as used in:
-Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
+<a class="reference external" href="https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf">Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
-Social Network Analysis and Mining6(19), 1–22 (2016)
+Social Network Analysis and Mining 6(19), 1–22 (2016)</a>.
-The datasets ‘semeval13’, ‘semeval14’, ‘semeval15’ share the same training set.</p>
+Note that the datasets ‘semeval13’, ‘semeval14’, ‘semeval15’ share the same training set.
 The list of valid dataset names corresponding to training sets can be accessed in
 <cite>quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN</cite>, while the test sets can be accessed in
 <cite>quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST</cite></p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
-<dd class="field-odd"><p><strong>dataset_name</strong> – the name of the dataset: valid ones are ‘gasp’, ‘hcr’, ‘omd’, ‘sanders’, ‘semeval13’,</p>
+<dd class="field-odd"><ul class="simple">
 <li><p><strong>dataset_name</strong> – the name of the dataset: valid ones are ‘gasp’, ‘hcr’, ‘omd’, ‘sanders’, ‘semeval13’,
 ‘semeval14’, ‘semeval15’, ‘semeval16’, ‘sst’, ‘wa’, ‘wb’</p></li>
 <li><p><strong>for_model_selection</strong> – if True, then returns the train split as the training set and the devel split
 as the test set; if False, then returns the train+devel split as the training set and the test set as the
 test set</p></li>
 <li><p><strong>min_df</strong> – minimum number of documents that should contain a term in order for the term to be kept</p></li>
 <li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
 ~/quapy_data/ directory)</p></li>
 <li><p><strong>pickle</strong> – set to True to pickle the Dataset object the first time it is generated, in order to allow for
 faster subsequent invocations</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
 </dd>
 </dl>
 <p>‘semeval14’, ‘semeval15’, ‘semeval16’, ‘sst’, ‘wa’, ‘wb’
 :param for_model_selection: if True, then returns the train split as the training set and the devel split
 as the test set; if False, then returns the train+devel split as the training set and the test set as the
 test set
 :param min_df: minimun number of documents that should contain a term in order for the term to be kept
 :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
 ~/quay_data/ directory)
 :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
 faster subsequent invokations
 :return: a Dataset instance</p>
 </dd></dl>
 <dl class="py function">
@ -278,15 +696,41 @@ faster subsequent invokations
 <dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer">
 <em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">IndexTransformer</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer" title="Permalink to this definition">¶</a></dt>
 <dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
 <p>This class implements a sklearn’s-style transformer that indexes text as numerical ids for the tokens it
 contains, and that would be generated by sklearn’s
 <a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a></p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><p><strong>kwargs</strong> – keyword arguments from <a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a>
 </p>
 </dd>
 </dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.add_word">
 <span class="sig-name descname"><span class="pre">add_word</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">word</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nogaps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.add_word" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Adds a new token (regardless of whether it has been found in the text or not), with dedicated id.
 Useful to define special tokens for codifying unknown words, or padding tokens.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>word</strong> – string, surface form of the token</p></li>
 <li><p><strong>id</strong> – integer, numerical value to assign to the token (leave as None for indicating the next valid id,
 default)</p></li>
 <li><p><strong>nogaps</strong> – if set to True (default) asserts that the id indicated leads to no numerical gaps with
 precedent ids stored so far</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>integer, the numerical id for the new token</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit">
 <span class="sig-name descname"><span class="pre">fit</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit" title="Permalink to this definition">¶</a></dt>
-<dd><dl class="field-list simple">
+<dd><p>Fits the transformer, i.e., decides on the vocabulary, given a list of strings.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><p><strong>X</strong> – a list of strings</p>
 </dd>
@ -299,66 +743,139 @@ faster subsequent invokations
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit_transform">
 <span class="sig-name descname"><span class="pre">fit_transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-</span> <span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit_transform" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Fits the transform on <cite>X</cite> and transforms it.</p>
-
+<dl class="field-list simple">
-<dl class="py method">
+<dt class="field-odd">Parameters</dt>
-<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.index">
+<dd class="field-odd"><ul class="simple">
-<span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">documents</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.index" title="Permalink to this definition">¶</a></dt>
+<li><p><strong>X</strong> – a list of strings</p></li>
-<dd></dd></dl>
+<li><p><strong>n_jobs</strong> – the number of parallel workers to carry out this task</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a <cite>np.ndarray</cite> of numerical ids</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.transform">
 <span class="sig-name descname"><span class="pre">transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-</span> <span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.transform" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Transforms the strings in <cite>X</cite> as lists of numerical ids</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>X</strong> – a list of strings</p></li>
 <li><p><strong>n_jobs</strong> – the number of parallel workers to carry out this task</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a <cite>np.ndarray</cite> of numerical ids</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.vocabulary_size">
 <span class="sig-name descname"><span class="pre">vocabulary_size</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.vocabulary_size" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Gets the length of the vocabulary according to which the document tokens have been indexed</p>
 <dl class="field-list simple">
 <dt class="field-odd">Returns</dt>
 <dd class="field-odd"><p>integer</p>
 </dd>
 </dl>
 </dd></dl>
 </dd></dl>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.data.preprocessing.index">
 <span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.index" title="Permalink to this definition">¶</a></dt>
-<dd><p>Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index.
+<dd><p>Indexes the tokens of a textual <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of string documents.
-Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK
+To index a document means to replace each different token by a unique numerical index.
-:param dataset: a Dataset where the instances are lists of str
+Rare words (i.e., words occurring less than <cite>min_df</cite> times) are replaced by a special token <cite>UNK</cite></p>
-:param min_df: minimum number of instances below which the term is replaced by a UNK index
+<dl class="field-list simple">
-:param inplace: whether or not to apply the transformation inplace, or to a new copy
+<dt class="field-odd">Parameters</dt>
-:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer)
+<dd class="field-odd"><ul class="simple">
-:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
+<li><p><strong>dataset</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> object where the instances of training and test documents
-consisting of lists of integer values representing indices.</p>
+are lists of str</p></li>
 <li><p><strong>min_df</strong> – minimum number of occurrences below which the term is replaced by a <cite>UNK</cite> index</p></li>
 <li><p><strong>inplace</strong> – whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
 <li><p><strong>kwargs</strong> – the rest of parameters of the transformation (as for sklearn’s
 <a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a>)</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (if inplace=False) or a reference to the current
 <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (inplace=True) consisting of lists of integer values representing indices.</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.data.preprocessing.reduce_columns">
 <span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">reduce_columns</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.reduce_columns" title="Permalink to this definition">¶</a></dt>
-<dd><p>Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least
+<dd><p>Reduces the dimensionality of the instances, represented as a <cite>csr_matrix</cite> (or any subtype of
-_min_df_ instances
+<cite>scipy.sparse.spmatrix</cite>), of training and test documents by removing the columns of words which are not present
-:param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix)
+in at least <cite>min_df</cite> instances in the training set</p>
-:param min_df: minimum number of instances below which the columns are removed
+<dl class="field-list simple">
-:param inplace: whether or not to apply the transformation inplace, or to a new copy
+<dt class="field-odd">Parameters</dt>
-:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
+<dd class="field-odd"><ul class="simple">
-where the dimensions corresponding to infrequent instances have been removed</p>
+<li><p><strong>dataset</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> in which instances are represented in sparse format (any
 subtype of scipy.sparse.spmatrix)</p></li>
 <li><p><strong>min_df</strong> – integer, minimum number of instances below which the columns are removed</p></li>
 <li><p><strong>inplace</strong> – whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (if inplace=False) or a reference to the current
 <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (inplace=True) where the dimensions corresponding to infrequent terms
 in the training set have been removed</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.data.preprocessing.standardize">
-<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">standardize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.standardize" title="Permalink to this definition">¶</a></dt>
+<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">standardize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.standardize" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Standardizes the real-valued columns of a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>.
 Standardization, aka z-scoring, of a variable <cite>X</cite> comes down to subtracting the average and normalizing by the
 standard deviation.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>dataset</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> object</p></li>
 <li><p><strong>inplace</strong> – set to True if the transformation is to be applied inplace, or to False (default) if a new
 <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> is to be returned</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p></p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.data.preprocessing.text2tfidf">
 <span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">text2tfidf</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sublinear_tf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.text2tfidf" title="Permalink to this definition">¶</a></dt>
-<dd><p>Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors
+<dd><p>Transforms a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of textual instances into a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of
-:param dataset: a Dataset where the instances are lists of str
+tfidf weighted sparse vectors</p>
-:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary
+<dl class="field-list simple">
-:param sublinear_tf: whether or not to apply the log scalling to the tf counters
+<dt class="field-odd">Parameters</dt>
-:param inplace: whether or not to apply the transformation inplace, or to a new copy
+<dd class="field-odd"><ul class="simple">
-:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer)
+<li><p><strong>dataset</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> where the instances of training and test collections are
-:return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)
+lists of str</p></li>
-where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
+<li><p><strong>min_df</strong> – minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)</p></li>
 <li><p><strong>sublinear_tf</strong> – whether or not to apply the log scaling to the tf counters (default True)</p></li>
 <li><p><strong>inplace</strong> – whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
 <li><p><strong>kwargs</strong> – the rest of parameters of the transformation (as for sklearn’s
 <a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html">TfidfVectorizer</a>)</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> in <cite>csr_matrix</cite> format (if inplace=False) or a reference to the
 current Dataset (if inplace=True) where the instances are stored in a <cite>csr_matrix</cite> of real-valued tfidf scores</p>
 </dd>
 </dl>
 </dd></dl>
 </section>
@ -367,7 +884,24 @@ where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.data.reader.binarize">
 <span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">binarize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pos_class</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.reader.binarize" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Binarizes a categorical array-like collection of labels towards the positive class <cite>pos_class</cite>. E.g.,:</p>
 <div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">binarize</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
 <span class="gp">&gt;&gt;&gt; </span><span class="n">array</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">])</span>
 </pre></div>
 </div>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>y</strong> – array-like of labels</p></li>
 <li><p><strong>pos_class</strong> – integer, the positive class</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a binary np.ndarray, in which the value 1 corresponds to positions in which <cite>y</cite> had <cite>pos_class</cite> labels, and
 0 otherwise</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.data.reader.from_csv">
@ -376,10 +910,13 @@ where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
 File format &lt;label&gt;,&lt;feat1&gt;,&lt;feat2&gt;,…,&lt;featn&gt;</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
-<dd class="field-odd"><p><strong>path</strong> – path to the csv file</p>
+<dd class="field-odd"><ul class="simple">
 <li><p><strong>path</strong> – path to the csv file</p></li>
 <li><p><strong>encoding</strong> – the text encoding used to open the file</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
-<dd class="field-even"><p>a ndarray for the labels and a ndarray (float) for the covariates</p>
+<dd class="field-even"><p>a np.ndarray for the labels and a ndarray (float) for the covariates</p>
 </dd>
 </dl>
 </dd></dl>
@ -394,7 +931,7 @@ File format &lt;-1 or 0 or 1&gt;[s col(int):val(float)]</p>
 <dd class="field-odd"><p><strong>path</strong> – path to the labelled collection</p>
 </dd>
 <dt class="field-even">Returns</dt>
-<dd class="field-even"><p>a csr_matrix containing the instances (rows), and a ndarray containing the labels</p>
+<dd class="field-even"><p>a <cite>csr_matrix</cite> containing the instances (rows), and a ndarray containing the labels</p>
 </dd>
 </dl>
 </dd></dl>
@ -406,7 +943,11 @@ File format &lt;-1 or 0 or 1&gt;[s col(int):val(float)]</p>
 File format &lt;0 or 1&gt;        &lt;document&gt;</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
-<dd class="field-odd"><p><strong>path</strong> – path to the labelled collection</p>
+<dd class="field-odd"><ul class="simple">
 <li><p><strong>path</strong> – path to the labelled collection</p></li>
 <li><p><strong>encoding</strong> – the text encoding used to open the file</p></li>
 <li><p><strong>verbose</strong> – if &gt;0 (default) shows some progress information in standard output</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a list of sentences, and a list of labels</p>
@ -418,9 +959,19 @@ File fomart &lt;0 or 1&gt;        &lt;document&gt;</p>
 <dt class="sig sig-object py" id="quapy.data.reader.reindex_labels">
 <span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">reindex_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.reader.reindex_labels" title="Permalink to this definition">¶</a></dt>
 <dd><p>Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
-E.g., y=[‘B’, ‘B’, ‘A’, ‘C’] -&gt; [1,1,0,2], [‘A’,’B’,’C’]
+E.g.:</p>
-:param y: the list or array of original labels
+<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">reindex_labels</span><span class="p">([</span><span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;A&#39;</span><span class="p">,</span> <span class="s1">&#39;C&#39;</span><span class="p">])</span>
-:return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.</p>
+<span class="gp">&gt;&gt;&gt; </span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">]),</span> <span class="n">array</span><span class="p">([</span><span class="s1">&#39;A&#39;</span><span class="p">,</span> <span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;C&#39;</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="s1">&#39;&lt;U1&#39;</span><span class="p">))</span>
 </pre></div>
 </div>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><p><strong>y</strong> – the list or array of original labels</p>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.</p>
 </dd>
 </dl>
 </dd></dl>
 </section>
--- a/docs/build/html/quapy.html
+++ b/docs/build/html/quapy.html
@ -515,14 +515,20 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.evaluation.artificial_prevalence_prediction">
 <span class="sig-prename descclassname"><span class="pre">quapy.evaluation.</span></span><span class="sig-name descname"><span class="pre">artificial_prevalence_prediction</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="quapy.method.html#quapy.method.base.BaseQuantifier" title="quapy.method.base.BaseQuantifier"><span class="pre">quapy.method.base.BaseQuantifier</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="quapy.data.html#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevpoints</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">210</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_repetitions</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eval_budget</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span 
class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">42</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.evaluation.artificial_prevalence_prediction" title="Permalink to this definition">¶</a></dt>
-<dd><p>Performs the predictions for all samples generated according to the artificial sampling protocol.
+<dd><p>Performs the predictions for all samples generated according to the artificial sampling protocol.</p>
-:param model: the model in charge of generating the class prevalence estimations
+<dl class="field-list simple">
-:param test: the test set on which to perform arificial sampling
+<dt class="field-odd">Parameters</dt>
-:param sample_size: the size of the samples
+<dd class="field-odd"><ul class="simple">
-:param n_prevpoints: the number of different prevalences to sample (or set to None if eval_budget is specified)
+<li><p><strong>model</strong> – the model in charge of generating the class prevalence estimations</p></li>
-:param n_repetitions: the number of repetitions for each prevalence
+<li><p><strong>test</strong> – the test set on which to perform artificial sampling</p></li>
-:param eval_budget: if specified, sets a ceil on the number of evaluations to perform. For example, if there are 3
+<li><p><strong>sample_size</strong> – the size of the samples</p></li>
-classes, n_repetitions=1 and eval_budget=20, then n_prevpoints will be set to 5, since this will generate 15
+<li><p><strong>n_prevpoints</strong> – the number of different prevalences to sample (or set to None if eval_budget is specified)</p></li>
 <li><p><strong>n_repetitions</strong> – the number of repetitions for each prevalence</p></li>
 <li><p><strong>eval_budget</strong> – if specified, sets a ceil on the number of evaluations to perform. For example, if there are 3</p></li>
 </ul>
 </dd>
 </dl>
 <p>classes, n_repetitions=1 and eval_budget=20, then n_prevpoints will be set to 5, since this will generate 15
 different prevalences ([0, 0, 1], [0, 0.25, 0.75], [0, 0.5, 0.5] … [1, 0, 0]) and since setting it n_prevpoints
 to 6 would produce more than 20 evaluations.
 :param n_jobs: number of jobs to be run in parallel
@ -601,7 +607,31 @@ contains the the prevalence estimations</p>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.functional.artificial_prevalence_sampling">
 <span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">artificial_prevalence_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dimensions</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeat</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_constrained_dim</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.artificial_prevalence_sampling" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Generates vectors of prevalence values artificially drawn from an exhaustive grid of prevalence values. The
 number of prevalence values explored for each dimension depends on <cite>n_prevalences</cite>, so that, if, for example,
 <cite>n_prevalences=11</cite> then the prevalence values of the grid are taken from [0, 0.1, 0.2, …, 0.9, 1]. Only
 valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
 valid vector of prevalence values, <cite>repeat</cite> copies are returned. The vector of prevalence values can be
 implicit (by setting <cite>return_constrained_dim=False</cite>), meaning that the last dimension (which is constrained
 to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to 1).</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>dimensions</strong> – the number of classes</p></li>
 <li><p><strong>n_prevalences</strong> – the number of equidistant prevalence points to extract from the [0,1] interval for the grid
 (default is 21)</p></li>
 <li><p><strong>repeat</strong> – number of copies for each valid prevalence vector (default is 1)</p></li>
 <li><p><strong>return_constrained_dim</strong> – set to True to return all dimensions, or to False (default) for omitting the
 constrained dimension</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>an ndarray of shape <cite>(n, dimensions)</cite> if <cite>return_constrained_dim=True</cite> or of shape <cite>(n, dimensions-1)</cite>
 if <cite>return_constrained_dim=False</cite>, where <cite>n</cite> is the number of valid combinations found in the grid multiplied
 by <cite>repeat</cite></p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.functional.get_nprevpoints_approximation">
@ -634,8 +664,21 @@ number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.functional.prevalence_from_labels">
-<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_from_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes_</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_from_labels" title="Permalink to this definition">¶</a></dt>
+<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_from_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_from_labels" title="Permalink to this definition">¶</a></dt>
-<dd></dd></dl>
+<dd><p>Computes the prevalence values from a vector of labels.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>labels</strong> – array-like of shape <cite>(n_instances)</cite> with the label for each instance</p></li>
 <li><p><strong>classes</strong> – the class labels. This is needed in order to correctly compute the prevalence vector even when
 some classes have no examples.</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>an ndarray of shape <cite>(len(classes))</cite> with the class prevalence values</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.functional.prevalence_from_probabilities">
@ -645,13 +688,21 @@ number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0
 <dl class="py function">
 <dt class="sig sig-object py" id="quapy.functional.prevalence_linspace">
 <span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_linspace</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeat</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">smooth_limits_epsilon</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.01</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_linspace" title="Permalink to this definition">¶</a></dt>
-<dd><p>Produces a uniformly separated values of prevalence. By default, produces an array 21 prevalences, with step 0.05
+<dd><p>Produces uniformly separated values of prevalence. By default, produces an array of 21 prevalence values, with
-and with the limits smoothed, i.e.:
+step 0.05 and with the limits smoothed, i.e.:
-[0.01, 0.05, 0.10, 0.15, …, 0.90, 0.95, 0.99]
+[0.01, 0.05, 0.10, 0.15, …, 0.90, 0.95, 0.99]</p>
-:param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
+<dl class="field-list simple">
-:param repeat: number of times each prevalence is to be repeated (defaults to 1)
+<dt class="field-odd">Parameters</dt>
-:param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
+<dd class="field-odd"><ul class="simple">
-:return: an array of uniformly separated prevalence values</p>
+<li><p><strong>n_prevalences</strong> – the number of prevalence values to sample from the [0,1] interval (default 21)</p></li>
 <li><p><strong>repeat</strong> – number of times each prevalence is to be repeated (defaults to 1)</p></li>
 <li><p><strong>smooth_limits_epsilon</strong> – the quantity to add and subtract to the limits 0 and 1</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns</dt>
 <dd class="field-even"><p>an array of uniformly separated prevalence values</p>
 </dd>
 </dl>
 </dd></dl>
 <dl class="py function">
--- a/docs/build/html/searchindex.js
+++ b/docs/build/html/searchindex.js
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@ -1,6 +1,3 @@
 from abc import abstractmethod
 from typing import List, Union
 import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
@ -9,18 +6,19 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
 from quapy.functional import artificial_prevalence_sampling, strprev
 class LabelledCollection:
-    '''
+    """
-    A LabelledCollection is a set of objects each with a label associated to it.
+    A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling
-    '''
+    routines.
    :param instances: array-like (np.ndarray, list, or csr_matrix are supported)
    :param labels: array-like with the same length of instances
    :param classes_: optional, list of classes from which labels are taken. If not specified, the classes are inferred
        from the labels. The classes must be indicated in cases in which some of the labels might have no examples
        (i.e., a prevalence of 0)
    """
    def __init__(self, instances, labels, classes_=None):
        """
        :param instances: list of objects
        :param labels: list of labels, same length of instances
        :param classes_: optional, list of classes from which labels are taken. When used, must contain the set of values used in labels.
        """
        if issparse(instances):
            self.instances = instances
        elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):
@ -42,28 +40,81 @@ class LabelledCollection:
    @classmethod
    def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):
        """
        Reads a labelled set of data and wraps it as a :class:`LabelledCollection`. The actual parsing is delegated
        to `loader_func`, which can be a custom function or any of the reading functions defined in the
        :mod:`quapy.data.reader` module.

        :param path: string, the path to the file containing the labelled instances
        :param loader_func: callable invoked as `loader_func(path, **loader_kwargs)`; its result is unpacked as the
            leading positional arguments of the :class:`LabelledCollection` constructor (instances and labels)
        :param classes: array-like, the classes according to which the instances are labelled; it is passed to the
            constructor right after the values returned by `loader_func`
        :param loader_kwargs: any additional keyword argument that `loader_func` needs in order to read the instances
        :return: a :class:`LabelledCollection` object
        """
        instances_and_labels = loader_func(path, **loader_kwargs)
        return LabelledCollection(*instances_and_labels, classes)
    def __len__(self):
        """
        Returns the length of this collection (number of labelled instances)
        :return: integer
        """
        return self.instances.shape[0]
    def prevalence(self):
        """
        Returns the prevalence, or relative frequency, of the classes of interest.
        :return: a np.ndarray of shape `(n_classes)` with the relative frequencies of each class, in the same order
            as listed by `self.classes_`
        """
        return self.counts() / len(self)
    def counts(self):
        """
        Returns the number of instances for each of the classes of interest.
        :return: a np.ndarray of shape `(n_classes)` with the number of instances of each class, in the same order
            as listed by `self.classes_`
        """
        return np.asarray([len(self.index[class_]) for class_ in self.classes_])
    @property
    def n_classes(self):
        """
        The number of classes
        :return: integer
        """
        return len(self.classes_)
    @property
    def binary(self):
        """
        Returns True if the number of classes is 2
        :return: boolean
        """
        return self.n_classes == 2
    def sampling_index(self, size, *prevs, shuffle=True):
        """
        Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
        prevalence values are not specified, then returns the index of a uniform sampling.
        For each class, the sampling is drawn without replacement if the requested prevalence is larger than
        the actual prevalence of the class, or with replacement otherwise.
        :param size: integer, the requested size
        :param prevs: the prevalence for each class; the prevalence value for the last class can be left empty since
            it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
            `self.classes_`) can be specified, while the other class takes prevalence value `1-p`
        :param shuffle: if set to True (default), shuffles the index before returning it
        :return: a np.ndarray of shape `(size)` with the indexes
        """
        if len(prevs) == 0:  # no prevalence was indicated; returns an index for uniform sampling
-            return np.random.choice(len(self), size, replace=False)
+            return self.uniform_sampling_index(size)
        if len(prevs) == self.n_classes - 1:
            prevs = prevs + (1 - sum(prevs),)
        assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
@ -93,47 +144,142 @@ class LabelledCollection:
        return indexes_sample
    def uniform_sampling_index(self, size):
        """
        Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
        without replacement if the requested size is greater than the number of instances, or with replacement
        otherwise.
        :param size: integer, the size of the uniform sample
        :return: a np.ndarray of shape `(size)` with the indexes
        """
        return np.random.choice(len(self), size, replace=False)
    def uniform_sampling(self, size):
        unif_index = self.uniform_sampling_index(size)
        return self.sampling_from_index(unif_index)
    def sampling(self, size, *prevs, shuffle=True):
        """
        Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence
        values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
        the actual prevalence of the class, or with replacement otherwise.
        :param size: integer, the requested size
        :param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since
            it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
            `self.classes_` can be specified, while the other class takes prevalence value `1-p`
        :param shuffle: if set to True (default), shuffles the index before returning it
        :return: an instance of :class:`LabelledCollection` with length == `size` and prevalence close to `prevs` (or
            prevalence == `prevs` if the exact prevalence values can be met as proportions of instances)
        """
        prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
        return self.sampling_from_index(prev_index)
    def uniform_sampling(self, size):
        """
        Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
        without replacement if the requested size is greater than the number of instances, or with replacement
        otherwise.
        :param size: integer, the requested size
        :return: an instance of :class:`LabelledCollection` with length == `size`
        """
        unif_index = self.uniform_sampling_index(size)
        return self.sampling_from_index(unif_index)
    def sampling_from_index(self, index):
        """
        Builds a new :class:`LabelledCollection` with the instances and labels of this collection selected by
        `index`. The classes of this collection are handed over to the new one.

        :param index: np.ndarray used to index both `self.instances` and `self.labels`
        :return: an instance of :class:`LabelledCollection`
        """
        return LabelledCollection(self.instances[index], self.labels[index], classes_=self.classes_)
    def split_stratified(self, train_prop=0.6, random_state=None):
        """
        Returns two instances of :class:`LabelledCollection` split with stratification from this collection, at desired
        proportion.

        :param train_prop: the proportion of elements to include in the left-most returned collection (typically used
            as the training collection). The rest of elements are included in the right-most returned collection
            (typically used as a test collection).
        :param random_state: if specified, guarantees reproducibility of the split.
        :return: two instances of :class:`LabelledCollection`, the first one with a proportion `train_prop` of the
            elements, and the second one with the remaining `1-train_prop`
        """
        tr_docs, te_docs, tr_labels, te_labels = train_test_split(
            self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state
        )
        # hand over the classes of this collection (as sampling_from_index does), so that classes without
        # examples are not lost when the classes of each split would otherwise be inferred from its labels
        training = LabelledCollection(tr_docs, tr_labels, classes_=self.classes_)
        test = LabelledCollection(te_docs, te_labels, classes_=self.classes_)
        return training, test
    def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
        """
        Generator of samples according to the artificial prevalence protocol (APP). The APP explores a grid of
        prevalence values (e.g., [0, 0.05, 0.1, 0.15, ..., 1]) and generates all valid combinations of
        prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
        [1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
        combination of prevalence values is indicated by `repeats`

        :param sample_size: the number of instances in each sample
        :param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
            limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
        :param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
        :return: yield samples generated at artificially controlled prevalence values
        """
        prevalence_grid = artificial_prevalence_sampling(self.n_classes, n_prevalences, repeats)
        for prevalence_vector in prevalence_grid:
            # every prevalence vector of the grid is unpacked as the `prevs` arguments of `sampling`
            yield self.sampling(sample_size, *prevalence_vector)
    def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
        """
        Generator of sample indexes according to the artificial prevalence protocol (APP). The APP explores a grid
        of prevalence values (e.g., [0, 0.05, 0.1, 0.15, ..., 1]) and generates all valid combinations of
        prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
        [1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of sample indexes for each
        valid combination of prevalence values is indicated by `repeats`

        :param sample_size: the number of instances in each sample (i.e., length of each index)
        :param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
            limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
        :param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
        :return: yield the indexes that generate the samples according to APP
        """
        prevalence_grid = artificial_prevalence_sampling(self.n_classes, n_prevalences, repeats)
        for prevalence_vector in prevalence_grid:
            # every prevalence vector of the grid is unpacked as the `prevs` arguments of `sampling_index`
            yield self.sampling_index(sample_size, *prevalence_vector)
    def natural_sampling_generator(self, sample_size, repeats=100):
        """
        A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
        samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
        :param sample_size: integer, the number of instances in each sample
        :param repeats: the number of samples to generate
        :return: yield instances of :class:`LabelledCollection`
        """
        for _ in range(repeats):
            yield self.uniform_sampling(sample_size)
    def natural_sampling_index_generator(self, sample_size, repeats=100):
        """
        A generator of sample indexes according to the natural prevalence protocol (NPP). The NPP consists of drawing
        samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
        :param sample_size: integer, the number of instances in each sample (i.e., the length of each index)
        :param repeats: the number of indexes to generate
        :return: yield `repeats` instances of np.ndarray with shape `(sample_size,)`
        """
        for _ in range(repeats):
            yield self.uniform_sampling_index(sample_size)
    def __add__(self, other):
        """
        Returns a new :class:`LabelledCollection` as the union of this collection with another collection
        :param other: another :class:`LabelledCollection`
        :return: a :class:`LabelledCollection` representing the union of both collections
        """
        if other is None:
            return self
        elif issparse(self.instances) and issparse(other.instances):
@ -149,9 +295,29 @@ class LabelledCollection:
    @property
    def Xy(self):
        """
        Gets the instances and labels. This is useful when working with `sklearn` estimators, e.g.:
        >>> svm = LinearSVC().fit(*my_collection.Xy)
        :return: a tuple `(instances, labels)` from this collection
        """
        return self.instances, self.labels
    def stats(self, show=True):
        """
        Returns (and optionally prints) a dictionary with some stats of this collection. E.g.:
        >>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
        >>> data.training.stats()
        >>> #instances=3821, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]
        :param show: if set to True (default), prints the stats in standard output
        :return: a dictionary containing some stats of this collection. Keys include `#instances` (the number of
            instances), `type` (the type representing the instances), `#features` (the number of features, if the
            instances are in array-like format), `#classes` (the classes of the collection), `prevs` (the prevalence
            values for each class)
        """
        ninstances = len(self)
        instance_type = type(self.instances[0])
        if instance_type == list:
@ -171,6 +337,14 @@ class LabelledCollection:
        return stats_
    def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
        """
        Generator of stratified folds to be used in k-fold cross validation.
        :param nfolds: integer (default 5), the number of folds to generate
        :param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run
        :param random_state: integer (default 0), guarantees that the folds generated are reproducible
        :return: yields `nfolds * nrepeats` folds for k-fold cross validation
        """
        kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
        for train_index, test_index in kf.split(*self.Xy):
            train = self.sampling_from_index(train_index)
@ -178,8 +352,15 @@ class LabelledCollection:
            yield train, test
 class Dataset:
    """
    Abstraction of training and test :class:`LabelledCollection` objects.
    :param training: a :class:`LabelledCollection` instance
    :param test: a :class:`LabelledCollection` instance
    :param vocabulary: if indicated, is a dictionary of the terms used in this textual dataset
    :param name: a string representing the name of the dataset
    """
    def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
        assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections'
@ -190,45 +371,118 @@ class Dataset:
    @classmethod
    def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
        """
        Generates a dataset from a stratified split of a :class:`LabelledCollection` instance.
        See :meth:`LabelledCollection.split_stratified`

        :param collection: the :class:`LabelledCollection` to be split
        :param train_size: the proportion of training documents (the rest makes up the test split); it is passed
            to `split_stratified` as `train_prop`
        :return: an instance of the class this method is called on (:class:`Dataset` or a subclass of it)
        """
        # build through `cls` rather than a hard-coded class name so that subclasses get instances of themselves
        return cls(*collection.split_stratified(train_prop=train_size))
    @property
    def classes_(self):
        """
        The classes according to which the training collection is labelled

        :return: the `classes_` attribute of the training collection
        """
        training = self.training
        return training.classes_
    @property
    def n_classes(self):
        """
        The number of classes according to which the training collection is labelled

        :return: the `n_classes` attribute of the training collection
        """
        training = self.training
        return training.n_classes
    @property
    def binary(self):
        """
        Tells whether the training collection is labelled according to two classes

        :return: the `binary` attribute of the training collection
        """
        training = self.training
        return training.binary
    @classmethod
-    def load(cls, train_path, test_path, loader_func: callable):
+    def load(cls, train_path, test_path, loader_func: callable, classes=None, **loader_kwargs):
-        training = LabelledCollection.load(train_path, loader_func)
+        """
-        test = LabelledCollection.load(test_path, loader_func)
+        Loads a training and a test labelled set of data and converts them into a :class:`Dataset` instance.
        The function in charge of reading the instances must be specified. This function can be a custom one, or any of
        the reading functions defined in :mod:`quapy.data.reader` module.
        :param train_path: string, the path to the file containing the training instances
        :param test_path: string, the path to the file containing the test instances
        :param loader_func: a custom function that implements the data loader and returns a tuple with instances and
            labels
        :param classes: array-like, the classes according to which the instances are labelled
        :param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances.
            See :meth:`LabelledCollection.load` for further details.
        :return: a :class:`Dataset` object
        """
        training = LabelledCollection.load(train_path, loader_func, classes, **loader_kwargs)
        test = LabelledCollection.load(test_path, loader_func, classes, **loader_kwargs)
        return Dataset(training, test)
    @property
    def vocabulary_size(self):
        """
        Size of the vocabulary attached to this dataset (meaningful for textual datasets for which a vocabulary
        was indicated)

        :return: integer, the length of `self.vocabulary`
        """
        # NOTE(review): presumably fails if no vocabulary was indicated (None) -- confirm against __init__
        vocabulary = self.vocabulary
        return len(vocabulary)
-    def stats(self):
+    def stats(self, show):
        """
        Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:
        >>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
        >>> data.stats()
        >>> Dataset=kindle #tr-instances=3821, #te-instances=21591, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]
        :param show: if set to True (default), prints the stats in standard output
        :return: a dictionary containing some stats of this collection for the training and test collections. The keys
            are `train` and `test`, and point to dedicated dictionaries of stats, for each collection, with keys
            `#instances` (the number of instances), `type` (the type representing the instances),
            `#features` (the number of features, if the instances are in array-like format), `#classes` (the classes of
            the collection), `prevs` (the prevalence values for each class)
        """
        tr_stats = self.training.stats(show=False)
        te_stats = self.test.stats(show=False)
-        print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
+        if show:
-              f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
+            print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
-              f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
+                  f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
                  f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
        return {'train': tr_stats, 'test': te_stats}
    @classmethod
    def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):
        """
        Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around
        :meth:`LabelledCollection.kFCV` that returns dataset instances made of training and test folds.

        :param data: the :class:`LabelledCollection` from which the folds are generated
        :param nfolds: integer (default 5), the number of folds to generate
        :param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run
        :param random_state: integer (default 0), guarantees that the folds generated are reproducible
        :return: yields `nfolds * nrepeats` folds for k-fold cross validation as instances of the class this method
            is called on (:class:`Dataset` or a subclass of it), each named after its fold and round
        """
        folds = data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)
        for i, (train, test) in enumerate(folds):
            # folds arrive round after round: position within the round -> fold, completed rounds -> round
            round_idx, fold_idx = divmod(i, nfolds)
            # build through `cls` rather than a hard-coded class name so that subclasses yield their own type
            yield cls(train, test, name=f'fold {fold_idx + 1}/{nfolds} (round={round_idx + 1})')
 def isbinary(data):
    """
    Tells whether `data` is a binary :class:`Dataset` or a binary :class:`LabelledCollection`

    :param data: a :class:`Dataset` or a :class:`LabelledCollection` object
    :return: the `binary` attribute of `data` if it is an instance of any of those two classes; False for any
        other kind of object
    """
    if not isinstance(data, (Dataset, LabelledCollection)):
        return False
    return data.binary
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -5,9 +5,6 @@ warnings.warn = warn
 import os
 import zipfile
 from os.path import join
 from urllib.error import HTTPError
 from sklearn.model_selection import StratifiedKFold
 import pandas as pd
 from quapy.data.base import Dataset, LabelledCollection
@ -49,18 +46,20 @@ UCI_DATASETS = ['acute.a', 'acute.b',
 def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
    """
-    Load a Reviews dataset as a Dataset instance, as used in:
+    Loads a Reviews dataset as a Dataset instance, as used in
-    Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
+    `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
-    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
+    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_.
    The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`
    :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
    :param tfidf: set to True to transform the raw documents into tfidf weighted matrices
    :param min_df: minimum number of documents that should contain a term in order for the term to be
-    kept (ignored if tfidf==False)
+        kept (ignored if tfidf==False)
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
-    ~/quay_data/ directory)
+        ~/quay_data/ directory)
    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
-    faster subsequent invokations
+        faster subsequent invocations
-    :return: a Dataset instance
+    :return: a :class:`quapy.data.base.Dataset` instance
    """
    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
@ -93,22 +92,25 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
 def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
    """
-    Load a Twitter dataset as a Dataset instance, as used in:
+    Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:
-    Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
+    `Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
-    Social Network Analysis and Mining6(19), 1–22 (2016)
+    Social Network Analysis and Mining6(19), 1–22 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_
-    The datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
+    Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
    The list of valid dataset names corresponding to training sets can be accessed in
    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in
    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`
    :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
-    'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
+        'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
    :param for_model_selection: if True, then returns the train split as the training set and the devel split
-    as the test set; if False, then returns the train+devel split as the training set and the test set as the
+        as the test set; if False, then returns the train+devel split as the training set and the test set as the
-    test set
+        test set
    :param min_df: minimum number of documents that should contain a term in order for the term to be kept
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
-    ~/quay_data/ directory)
+        ~/quay_data/ directory)
    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
-    faster subsequent invokations
+        faster subsequent invocations
-    :return: a Dataset instance
+    :return: a :class:`quapy.data.base.Dataset` instance
    """
    assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
@ -163,11 +165,58 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
 def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
    """
    Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
    and
    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
    Dynamic ensemble selection for quantification tasks.
    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
    These datasets come without a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
    information on how to use these collections), so a stratified train-test split is generated here at the
    requested proportion, with a fixed `random_state=0`.
    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`

    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the
        default directory)
    :param test_split: proportion of documents to be included in the test set. The rest makes up the training set
    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    collection = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
    # the split is parameterized by the training proportion, i.e., the complement of the test proportion
    train_prop = 1 - test_split
    return Dataset(*collection.split_stratified(train_prop, random_state=0))
 def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
    """
    Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
    and
    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
    Dynamic ensemble selection for quantification tasks.
    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
    The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
    protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
    This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
    >>> import quapy as qp
    >>> collection = qp.datasets.fetch_UCILabelledCollection("yeast")
    >>> for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
    >>>     ...
    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
        ~/quay_data/ directory)
    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
    :return: a :class:`quapy.data.base.LabelledCollection` instance
    """
    assert dataset_name in UCI_DATASETS, \
        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
@ -302,7 +351,7 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
        df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
        df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
-        [df_replace(df, col) for col in range(1, 6)]
+        [_df_replace(df, col) for col in range(1, 6)]
        X = df.loc[:, 0:5].values
        if dataset_name == 'acute.a':
            y = binarize(df[6], pos_class='yes')
@ -482,5 +531,5 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
    return data
-def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
+def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
    df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
--- a/quapy/data/preprocessing.py
+++ b/quapy/data/preprocessing.py
@ -12,14 +12,18 @@ from .base import LabelledCollection
 def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
    """
-    Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors
+    Transforms a :class:`quapy.data.base.Dataset` of textual instances into a :class:`quapy.data.base.Dataset` of
-    :param dataset: a Dataset where the instances are lists of str
+    tfidf weighted sparse vectors
-    :param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary
+
-    :param sublinear_tf: whether or not to apply the log scalling to the tf counters
+    :param dataset: a :class:`quapy.data.base.Dataset` where the instances of training and test collections are
-    :param inplace: whether or not to apply the transformation inplace, or to a new copy
+        lists of str
-    :param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer)
+    :param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)
-    :return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)
+    :param sublinear_tf: whether or not to apply the log scaling to the tf counters (default True)
-    where the instances are stored in a csr_matrix of real-valued tfidf scores
+    :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
    :param kwargs: the rest of parameters of the transformation (as for sklearn's
        `TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_)
    :return: a new :class:`quapy.data.base.Dataset` in `csr_matrix` format (if inplace=False) or a reference to the
        current Dataset (if inplace=True) where the instances are stored in a `csr_matrix` of real-valued tfidf scores
    """
    __check_type(dataset.training.instances, np.ndarray, str)
    __check_type(dataset.test.instances, np.ndarray, str)
@ -41,13 +45,17 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
 def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
    """
-    Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least
+    Reduces the dimensionality of the instances, represented as a `csr_matrix` (or any subtype of
-    _min_df_ instances
+    `scipy.sparse.spmatrix`), of training and test documents by removing the columns of words which are not present
-    :param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix)
+    in at least `min_df` instances in the training set
-    :param min_df: minimum number of instances below which the columns are removed
+
-    :param inplace: whether or not to apply the transformation inplace, or to a new copy
+    :param dataset: a :class:`quapy.data.base.Dataset` in which instances are represented in sparse format (any
-    :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
+        subtype of scipy.sparse.spmatrix)
-    where the dimensions corresponding to infrequent instances have been removed
+    :param min_df: integer, minimum number of instances below which the columns are removed
    :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
    :return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current
        :class:`quapy.data.base.Dataset` (inplace=True) where the dimensions corresponding to infrequent terms
        in the training set have been removed
    """
    __check_type(dataset.training.instances, spmatrix)
    __check_type(dataset.test.instances, spmatrix)
@ -71,7 +79,17 @@ def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
        return Dataset(training, test)
-def standardize(dataset: Dataset, inplace=True):
+def standardize(dataset: Dataset, inplace=False):
    """
    Standardizes the real-valued columns of a :class:`quapy.data.base.Dataset`.
    Standardization, aka z-scoring, of a variable `X` comes down to subtracting the average and normalizing by the
    standard deviation.
    :param dataset: a :class:`quapy.data.base.Dataset` object
    :param inplace: set to True if the transformation is to be applied inplace, or to False (default) if a new
        :class:`quapy.data.base.Dataset` is to be returned
    :return:
    """
    s = StandardScaler(copy=not inplace)
    training = s.fit_transform(dataset.training.instances)
    test = s.transform(dataset.test.instances)
@ -83,14 +101,18 @@ def standardize(dataset: Dataset, inplace=True):
 def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
    """
-    Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index.
+    Indexes the tokens of a textual :class:`quapy.data.base.Dataset` of string documents.
-    Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK
+    To index a document means to replace each different token by a unique numerical index.
-    :param dataset: a Dataset where the instances are lists of str
+    Rare words (i.e., words occurring less than `min_df` times) are replaced by a special token `UNK`
-    :param min_df: minimum number of instances below which the term is replaced by a UNK index
+
-    :param inplace: whether or not to apply the transformation inplace, or to a new copy
+    :param dataset: a :class:`quapy.data.base.Dataset` object where the instances of training and test documents
-    :param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer)
+        are lists of str
-    :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
+    :param min_df: minimum number of occurrences below which the term is replaced by a `UNK` index
-    consisting of lists of integer values representing indices.
+    :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
    :param kwargs: the rest of parameters of the transformation (as for sklearn's
        `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_)
    :return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current
        :class:`quapy.data.base.Dataset` (inplace=True) consisting of lists of integer values representing indices.
    """
    __check_type(dataset.training.instances, np.ndarray, str)
    __check_type(dataset.test.instances, np.ndarray, str)
@ -120,17 +142,23 @@ def __check_type(container, container_type=None, element_type=None):
 class IndexTransformer:
    """
    This class implements a sklearn's-style transformer that indexes text as numerical ids for the tokens it
    contains, and that would be generated by sklearn's
    `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
    :param kwargs: keyworded arguments from `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
    """
    def __init__(self, **kwargs):
        """
        Creates the underlying `CountVectorizer` and sets placeholder ids for the special tokens.

        :param kwargs: keyworded arguments forwarded to _sklearn.feature_extraction.text.CountVectorizer_
        """
        self.vect = CountVectorizer(**kwargs)
        # negative placeholders for the unknown and padding tokens; a valid index is assigned after fit
        self.unk, self.pad = -1, -2
    def fit(self, X):
        """
        Fits the transformer, i.e., decides on the vocabulary, given a list of strings.
        :param X: a list of strings
        :return: self
        """
@ -142,22 +170,52 @@ class IndexTransformer:
        return self
    def transform(self, X, n_jobs=-1):
        """
        Transforms the strings in `X` as lists of numerical ids
        :param X: a list of strings
        :param n_jobs: the number of parallel workers to carry out this task
        :return: a `np.ndarray` of numerical ids
        """
        # given the number of tasks and the number of jobs, generates the slices for the parallel processes
        assert self.unk != -1, 'transform called before fit'
-        indexed = map_parallel(func=self.index, args=X, n_jobs=n_jobs)
+        indexed = map_parallel(func=self._index, args=X, n_jobs=n_jobs)
        return np.asarray(indexed)
-    def index(self, documents):
+    def _index(self, documents):
        vocab = self.vocabulary_.copy()
        return [[vocab.prevalence(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
    def fit_transform(self, X, n_jobs=-1):
        """
        Fits the transformer on `X` and then transforms `X` itself.

        :param X: a list of strings
        :param n_jobs: the number of parallel workers to carry out the transformation
        :return: the outcome of :meth:`transform` on `X`, i.e., a `np.ndarray` of numerical ids
        """
        fitted = self.fit(X)
        return fitted.transform(X, n_jobs=n_jobs)
    def vocabulary_size(self):
        """
        Gets the number of entries of the vocabulary according to which the document tokens have been indexed

        :return: integer, the length of `self.vocabulary_`
        """
        vocabulary = self.vocabulary_
        return len(vocabulary)
    def add_word(self, word, id=None, nogaps=True):
        """
        Adds a new token (regardless of whether it has been found in the text or not), with dedicated id.
        Useful to define special tokens for codifying unknown words, or padding tokens.
        :param word: string, surface form of the token
        :param id: integer, numerical value to assign to the token (leave as None for indicating the next valid id,
            default)
        :param nogaps: if set to True (default) asserts that the id indicated leads to no numerical gaps with
            precedent ids stored so far
        :return: integer, the numerical id for the new token
        """
        if word in self.vocabulary_:
            raise ValueError(f'word {word} already in dictionary')
        if id is None:
--- a/quapy/data/reader.py
+++ b/quapy/data/reader.py
@ -7,7 +7,10 @@ def from_text(path, encoding='utf-8', verbose=1, class2int=True):
    """
    Reads a labelled collection of documents.
    File format <0 or 1>\t<document>\n
    :param path: path to the labelled collection
    :param encoding: the text encoding used to open the file
    :param verbose: if >0 (default) shows some progress information in standard output
    :return: a list of sentences, and a list of labels
    """
    all_sentences, all_labels = [], []
@ -35,8 +38,9 @@ def from_sparse(path):
    """
    Reads a labelled collection of real-valued instances expressed in sparse format
    File format <-1 or 0 or 1>[\s col(int):val(float)]\n
    :param path: path to the labelled collection
-    :return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
+    :return: a `csr_matrix` containing the instances (rows), and a ndarray containing the labels
    """
    def split_col_val(col_val):
@ -68,8 +72,10 @@ def from_csv(path, encoding='utf-8'):
    """
    Reads a csv file in which columns are separated by ','.
    File format <label>,<feat1>,<feat2>,...,<featn>\n
    :param path: path to the csv file
-    :return: a ndarray for the labels and a ndarray (float) for the covariates
+    :param encoding: the text encoding used to open the file
    :return: a np.ndarray for the labels and a ndarray (float) for the covariates
    """
    X, y = [], []
@ -85,11 +91,16 @@ def from_csv(path, encoding='utf-8'):
 def reindex_labels(y):
    """
    Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
-    E.g., y=['B', 'B', 'A', 'C'] -> [1,1,0,2], ['A','B','C']
+    E.g.:
    >>> reindex_labels(['B', 'B', 'A', 'C'])
    >>> (array([1, 1, 0, 2]), array(['A', 'B', 'C'], dtype='<U1'))
    :param y: the list or array of original labels
    :return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.
    """
-    classnames = sorted(np.unique(y))
+    y = np.asarray(y)
    classnames = np.asarray(sorted(np.unique(y)))
    label2index = {label: index for index, label in enumerate(classnames)}
    indexed = np.empty(y.shape, dtype=np.int)
    for label in classnames:
@ -98,6 +109,17 @@ def reindex_labels(y):
 def binarize(y, pos_class):
    """
    Binarizes a categorical array-like collection of labels towards the positive class `pos_class`. E.g.,:
    >>> binarize([1, 2, 3, 1, 1, 0], pos_class=2)
    >>> array([0, 1, 0, 0, 0, 0])
    :param y: array-like of labels
    :param pos_class: integer, the positive class
    :return: a binary np.ndarray, in which value 1 corresponds to positions in which `y` had `pos_class` labels, and
        0 otherwise
    """
    y = np.asarray(y)
    ybin = np.zeros(y.shape, dtype=np.int)
    ybin[y == pos_class] = 1