updating the documentation
This commit is contained in:
parent
1f591ec105
commit
2bd47f0841
|
@ -255,12 +255,8 @@
|
|||
<li><a href="quapy.method.html#quapy.method.neural.QuaNetModule.device">(quapy.method.neural.QuaNetModule property)</a>
|
||||
</li>
|
||||
</ul></li>
|
||||
<li><a href="quapy.data.html#quapy.data.datasets.df_replace">df_replace() (in module quapy.data.datasets)</a>
|
||||
</li>
|
||||
<li><a href="quapy.classification.html#quapy.classification.neural.TextClassifierNet.dimensions">dimensions() (quapy.classification.neural.TextClassifierNet method)</a>
|
||||
</li>
|
||||
</ul></td>
|
||||
<td style="width: 33%; vertical-align: top;"><ul>
|
||||
<li><a href="quapy.classification.html#quapy.classification.neural.CNNnet.document_embedding">document_embedding() (quapy.classification.neural.CNNnet method)</a>
|
||||
|
||||
<ul>
|
||||
|
@ -269,6 +265,8 @@
|
|||
<li><a href="quapy.classification.html#quapy.classification.neural.TextClassifierNet.document_embedding">(quapy.classification.neural.TextClassifierNet method)</a>
|
||||
</li>
|
||||
</ul></li>
|
||||
</ul></td>
|
||||
<td style="width: 33%; vertical-align: top;"><ul>
|
||||
<li><a href="quapy.html#quapy.util.download_file">download_file() (in module quapy.util)</a>
|
||||
</li>
|
||||
<li><a href="quapy.html#quapy.util.download_file_if_not_exists">download_file_if_not_exists() (in module quapy.util)</a>
|
||||
|
@ -462,19 +460,15 @@
|
|||
<table style="width: 100%" class="indextable genindextable"><tr>
|
||||
<td style="width: 33%; vertical-align: top;"><ul>
|
||||
<li><a href="quapy.data.html#quapy.data.preprocessing.index">index() (in module quapy.data.preprocessing)</a>
|
||||
|
||||
<ul>
|
||||
<li><a href="quapy.data.html#quapy.data.preprocessing.IndexTransformer.index">(quapy.data.preprocessing.IndexTransformer method)</a>
|
||||
</li>
|
||||
</ul></li>
|
||||
<li><a href="quapy.data.html#quapy.data.preprocessing.IndexTransformer">IndexTransformer (class in quapy.data.preprocessing)</a>
|
||||
</li>
|
||||
<li><a href="quapy.method.html#quapy.method.neural.QuaNetModule.init_hidden">init_hidden() (quapy.method.neural.QuaNetModule method)</a>
|
||||
</li>
|
||||
</ul></td>
|
||||
<td style="width: 33%; vertical-align: top;"><ul>
|
||||
<li><a href="quapy.method.html#quapy.method.base.isaggregative">isaggregative() (in module quapy.method.base)</a>
|
||||
</li>
|
||||
</ul></td>
|
||||
<td style="width: 33%; vertical-align: top;"><ul>
|
||||
<li><a href="quapy.html#quapy.isbinary">isbinary() (in module quapy)</a>
|
||||
|
||||
<ul>
|
||||
|
|
Binary file not shown.
|
@ -63,45 +63,144 @@
|
|||
<dt class="sig sig-object py" id="quapy.data.base.Dataset">
|
||||
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">Dataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">training</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">vocabulary</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||||
<p>Abstraction of training and test <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> objects.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>training</strong> – a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance</p></li>
|
||||
<li><p><strong>test</strong> – a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance</p></li>
|
||||
<li><p><strong>vocabulary</strong> – if indicated, is a dictionary of the terms used in this textual dataset</p></li>
|
||||
<li><p><strong>name</strong> – a string representing the name of the dataset</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.SplitStratified">
|
||||
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">SplitStratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">collection</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">train_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.SplitStratified" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Generates a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> from a stratified split of a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance.
|
||||
See <a class="reference internal" href="#quapy.data.base.LabelledCollection.split_stratified" title="quapy.data.base.LabelledCollection.split_stratified"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.split_stratified()</span></code></a></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>collection</strong> – <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p></li>
|
||||
<li><p><strong>train_size</strong> – the proportion of training documents (the rest conforms the test split)</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.binary">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.Dataset.binary" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns True if the training collection is labelled according to two classes</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>boolean</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.classes_">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">classes_</span></span><a class="headerlink" href="#quapy.data.base.Dataset.classes_" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>The classes according to which the training collection is labelled</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>The classes according to which the training collection is labelled</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.kFCV">
|
||||
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.kFCV" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around
|
||||
<a class="reference internal" href="#quapy.data.base.LabelledCollection.kFCV" title="quapy.data.base.LabelledCollection.kFCV"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.kFCV()</span></code></a> that returns <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> instances made of training and test folds.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>nfolds</strong> – integer (default 5), the number of folds to generate</p></li>
|
||||
<li><p><strong>nrepeats</strong> – integer (default 1), the number of rounds of k-fold cross validation to run</p></li>
|
||||
<li><p><strong>random_state</strong> – integer (default 0), guarantees that the folds generated are reproducible</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>yields <cite>nfolds * nrepeats</cite> folds for k-fold cross validation as instances of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.load">
|
||||
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.load" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">loader_kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.load" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Loads a training and a test labelled set of data and convert it into a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> instance.
|
||||
The function in charge of reading the instances must be specified. This function can be a custom one, or any of
|
||||
the reading functions defined in <a class="reference internal" href="#module-quapy.data.reader" title="quapy.data.reader"><code class="xref py py-mod docutils literal notranslate"><span class="pre">quapy.data.reader</span></code></a> module.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>train_path</strong> – string, the path to the file containing the training instances</p></li>
|
||||
<li><p><strong>test_path</strong> – string, the path to the file containing the test instances</p></li>
|
||||
<li><p><strong>loader_func</strong> – a custom function that implements the data loader and returns a tuple with instances and
|
||||
labels</p></li>
|
||||
<li><p><strong>classes</strong> – array-like, the classes according to which the instances are labelled</p></li>
|
||||
<li><p><strong>loader_kwargs</strong> – any argument that the <cite>loader_func</cite> function needs in order to read the instances.
|
||||
See <a class="reference internal" href="#quapy.data.base.LabelledCollection.load" title="quapy.data.base.LabelledCollection.load"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.load()</span></code></a> for further details.</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.n_classes">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.Dataset.n_classes" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>The number of classes according to which the training collection is labelled</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>integer</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.stats">
|
||||
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.stats" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.stats" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">'kindle'</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="n">data</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
|
||||
<span class="gp">>>> </span><span class="n">Dataset</span><span class="o">=</span><span class="n">kindle</span> <span class="c1">#tr-instances=3821, #te-instances=21591, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>show</strong> – if set to True (default), prints the stats in standard output</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a dictionary containing some stats of this collection for the training and test collections. The keys
|
||||
are <cite>train</cite> and <cite>test</cite>, and point to dedicated dictionaries of stats, for each collection, with keys
|
||||
<cite>#instances</cite> (the number of instances), <cite>type</cite> (the type representing the instances),
|
||||
<cite>#features</cite> (the number of features, if the instances are in array-like format), <cite>#classes</cite> (the classes of
|
||||
the collection), <cite>prevs</cite> (the prevalence values for each class)</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.vocabulary_size">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">vocabulary_size</span></span><a class="headerlink" href="#quapy.data.base.Dataset.vocabulary_size" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>integer</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
|
||||
|
||||
|
@ -109,161 +208,480 @@
|
|||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection">
|
||||
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">LabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">instances</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes_</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||||
<p>A LabelledCollection is a set of objects each with a label associated to it.</p>
|
||||
<p>A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling
|
||||
routines.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>instances</strong> – array-like (np.ndarray, list, or csr_matrix are supported)</p></li>
|
||||
<li><p><strong>labels</strong> – array-like with the same length of instances</p></li>
|
||||
<li><p><strong>classes</strong> – optional, list of classes from which labels are taken. If not specified, the classes are inferred
|
||||
from the labels. The classes must be indicated in cases in which some of the labels might have no examples
|
||||
(i.e., a prevalence of 0)</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.Xy">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">Xy</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.Xy" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Gets the instances and labels. This is useful when working with <cite>sklearn</cite> estimators, e.g.:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">svm</span> <span class="o">=</span> <span class="n">LinearSVC</span><span class="p">()</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="o">*</span><span class="n">my_collection</span><span class="o">.</span><span class="n">Xy</span><span class="p">)</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>a tuple <cite>(instances, labels)</cite> from this collection</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.artificial_sampling_generator">
|
||||
<span class="sig-name descname"><span class="pre">artificial_sampling_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">101</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.artificial_sampling_generator" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>A generator of samples that implements the artificial prevalence protocol (APP). The APP consists of exploring
|
||||
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, …, 1]), and generating all valid combinations of
|
||||
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], …,
|
||||
[1, 0, 0] prevalence values of size <cite>sample_size</cite> will be yielded). The number of samples for each valid
|
||||
combination of prevalence values is indicated by <cite>repeats</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>sample_size</strong> – the number of instances in each sample</p></li>
|
||||
<li><p><strong>n_prevalences</strong> – the number of prevalence points to be taken from the [0,1] interval (including the
|
||||
limits {0,1}). E.g., if <cite>n_prevalences=11</cite>, then the prevalence points to take are [0, 0.1, 0.2, …, 1]</p></li>
|
||||
<li><p><strong>repeats</strong> – the number of samples to generate for each valid combination of prevalence values (default 1)</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>yield samples generated at artificially controlled prevalence values</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.artificial_sampling_index_generator">
|
||||
<span class="sig-name descname"><span class="pre">artificial_sampling_index_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">101</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.artificial_sampling_index_generator" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>A generator of sample indexes implementing the artificial prevalence protocol (APP).
|
||||
The APP consists of exploring
|
||||
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, …, 1]), and generating all valid combinations of
|
||||
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], …,
|
||||
[1, 0, 0] prevalence values of size <cite>sample_size</cite> will be yielded). The number of sample indexes for each valid
|
||||
combination of prevalence values is indicated by <cite>repeats</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>sample_size</strong> – the number of instances in each sample (i.e., length of each index)</p></li>
|
||||
<li><p><strong>n_prevalences</strong> – the number of prevalence points to be taken from the [0,1] interval (including the
|
||||
limits {0,1}). E.g., if <cite>n_prevalences=11</cite>, then the prevalence points to take are [0, 0.1, 0.2, …, 1]</p></li>
|
||||
<li><p><strong>repeats</strong> – the number of samples to generate for each valid combination of prevalence values (default 1)</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>yield the indexes that generate the samples according to APP</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.binary">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.binary" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns True if the number of classes is 2</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>boolean</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.counts">
|
||||
<span class="sig-name descname"><span class="pre">counts</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.counts" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns the number of instances for each of the classes of interest.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>a np.ndarray of shape <cite>(n_classes)</cite> with the number of instances of each class, in the same order
|
||||
as listed by <cite>self.classes_</cite></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.kFCV">
|
||||
<span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.kFCV" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Generator of stratified folds to be used in k-fold cross validation.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>nfolds</strong> – integer (default 5), the number of folds to generate</p></li>
|
||||
<li><p><strong>nrepeats</strong> – integer (default 1), the number of rounds of k-fold cross validation to run</p></li>
|
||||
<li><p><strong>random_state</strong> – integer (default 0), guarantees that the folds generated are reproducible</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>yields <cite>nfolds * nrepeats</cite> folds for k-fold cross validation</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.load">
|
||||
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.load" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">loader_kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.load" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Loads a labelled set of data and convert it into a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance. The function in charge
|
||||
of reading the instances must be specified. This function can be a custom one, or any of the reading functions
|
||||
defined in <a class="reference internal" href="#module-quapy.data.reader" title="quapy.data.reader"><code class="xref py py-mod docutils literal notranslate"><span class="pre">quapy.data.reader</span></code></a> module.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>path</strong> – string, the path to the file containing the labelled instances</p></li>
|
||||
<li><p><strong>loader_func</strong> – a custom function that implements the data loader and returns a tuple with instances and
|
||||
labels</p></li>
|
||||
<li><p><strong>classes</strong> – array-like, the classes according to which the instances are labelled</p></li>
|
||||
<li><p><strong>loader_kwargs</strong> – any argument that the <cite>loader_func</cite> function needs in order to read the instances, i.e.,
|
||||
these arguments are used to call <cite>loader_func(path, **loader_kwargs)</cite></p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> object</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.n_classes">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.n_classes" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>The number of classes</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>integer</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.natural_sampling_generator">
|
||||
<span class="sig-name descname"><span class="pre">natural_sampling_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.natural_sampling_generator" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
|
||||
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>sample_size</strong> – integer, the number of instances in each sample</p></li>
|
||||
<li><p><strong>repeats</strong> – the number of samples to generate</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>yield instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.natural_sampling_index_generator">
|
||||
<span class="sig-name descname"><span class="pre">natural_sampling_index_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.natural_sampling_index_generator" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>A generator of sample indexes according to the natural prevalence protocol (NPP). The NPP consists of drawing
|
||||
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>sample_size</strong> – integer, the number of instances in each sample (i.e., the length of each index)</p></li>
|
||||
<li><p><strong>repeats</strong> – the number of indexes to generate</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>yield <cite>repeats</cite> instances of np.ndarray with shape <cite>(sample_size,)</cite></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.prevalence">
|
||||
<span class="sig-name descname"><span class="pre">prevalence</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.prevalence" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns the prevalence, or relative frequency, of the classes of interest.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>a np.ndarray of shape <cite>(n_classes)</cite> with the relative frequencies of each class, in the same order
|
||||
as listed by <cite>self.classes_</cite></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling">
|
||||
<span class="sig-name descname"><span class="pre">sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Return a random sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size and desired prevalence
|
||||
values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
|
||||
the actual prevalence of the class, or with replacement otherwise.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>size</strong> – integer, the requested size</p></li>
|
||||
<li><p><strong>prevs</strong> – the prevalence for each class; the prevalence value for the last class can be left empty since
|
||||
it is constrained. E.g., for binary collections, only the prevalence <cite>p</cite> for the first class (as listed in
|
||||
<cite>self.classes_</cite>) can be specified, while the other class takes prevalence value <cite>1-p</cite></p></li>
|
||||
<li><p><strong>shuffle</strong> – if set to True (default), shuffles the index before returning it</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> with length == <cite>size</cite> and prevalence close to <cite>prevs</cite> (or
|
||||
prevalence == <cite>prevs</cite> if the exact prevalence values can be met as proportions of instances)</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_from_index">
|
||||
<span class="sig-name descname"><span class="pre">sampling_from_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">index</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_from_index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> whose elements are sampled from this collection using the
|
||||
index.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>index</strong> – np.ndarray</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_index">
|
||||
<span class="sig-name descname"><span class="pre">sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
|
||||
prevalence values are not specified, then returns the index of a uniform sampling.
|
||||
For each class, the sampling is drawn without replacement if the requested prevalence is larger than
|
||||
the actual prevalence of the class, or with replacement otherwise.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>size</strong> – integer, the requested size</p></li>
|
||||
<li><p><strong>prevs</strong> – the prevalence for each class; the prevalence value for the last class can be left empty since
|
||||
it is constrained. E.g., for binary collections, only the prevalence <cite>p</cite> for the first class (as listed in
|
||||
<cite>self.classes_</cite>) can be specified, while the other class takes prevalence value <cite>1-p</cite></p></li>
|
||||
<li><p><strong>shuffle</strong> – if set to True (default), shuffles the index before returning it</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a np.ndarray of shape <cite>(size)</cite> with the indexes</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.split_stratified">
|
||||
<span class="sig-name descname"><span class="pre">split_stratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_prop</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.split_stratified" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> split with stratification from this collection, at desired
|
||||
proportion.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>train_prop</strong> – the proportion of elements to include in the left-most returned collection (typically used
|
||||
as the training collection). The rest of elements are included in the right-most returned collection
|
||||
(typically used as a test collection).</p></li>
|
||||
<li><p><strong>random_state</strong> – if specified, guarantees reproducibility of the split.</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>, the first one with <cite>train_prop</cite> elements, and the
|
||||
second one with <cite>1-train_prop</cite> elements</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.stats">
|
||||
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.stats" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">'kindle'</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="n">data</span><span class="o">.</span><span class="n">training</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
|
||||
<span class="gp">>>> </span><span class="c1">#instances=3821, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>show</strong> – if set to True (default), prints the stats in standard output</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a dictionary containing some stats of this collection. Keys include <cite>#instances</cite> (the number of
|
||||
instances), <cite>type</cite> (the type representing the instances), <cite>#features</cite> (the number of features, if the
|
||||
instances are in array-like format), <cite>#classes</cite> (the classes of the collection), <cite>prevs</cite> (the prevalence
|
||||
values for each class)</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling">
|
||||
<span class="sig-name descname"><span class="pre">uniform_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns a uniform sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size. The sampling is drawn
|
||||
without replacement if the requested size is greater than the number of instances, or with replacement
|
||||
otherwise.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>size</strong> – integer, the requested size</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> with length == <cite>size</cite></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling_index">
|
||||
<span class="sig-name descname"><span class="pre">uniform_sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling_index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
|
||||
without replacement if the requested size is greater than the number of instances, or with replacement
|
||||
otherwise.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>size</strong> – integer, the size of the uniform sample</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a np.ndarray of shape <cite>(size)</cite> with the indexes</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.isbinary">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">isbinary</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.isbinary" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns True if <cite>data</cite> is either a binary <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> or a binary <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>data</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> or a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> object</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>True if labelled according to two classes</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</section>
|
||||
<section id="module-quapy.data.datasets">
|
||||
<span id="quapy-data-datasets-module"></span><h2>quapy.data.datasets module<a class="headerlink" href="#module-quapy.data.datasets" title="Permalink to this headline">¶</a></h2>
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.datasets.df_replace">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">df_replace</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="pre">df</span></em>, <em class="sig-param"><span class="pre">col</span></em>, <em class="sig-param"><span class="pre">repl={'no':</span> <span class="pre">0</span></em>, <em class="sig-param"><span class="pre">'yes':</span> <span class="pre">1}</span></em>, <em class="sig-param"><span class="pre">astype=<class</span> <span class="pre">'float'></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.datasets.df_replace" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCIDataset">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCIDataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_split</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_UCIDataset" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Loads a UCI dataset as an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>, as used in
|
||||
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253516300628">Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
|
||||
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
|
||||
Information Fusion, 34, 87-100.</a>
|
||||
and
|
||||
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253517303652">Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
|
||||
Dynamic ensemble selection for quantification tasks.
|
||||
Information Fusion, 45, 1-15.</a>.
|
||||
The datasets do not come with a predefined train-test split (see <a class="reference internal" href="#quapy.data.datasets.fetch_UCILabelledCollection" title="quapy.data.datasets.fetch_UCILabelledCollection"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fetch_UCILabelledCollection()</span></code></a> for further
|
||||
information on how to use these collections), and so a train-test split is generated at desired proportion.
|
||||
The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_DATASETS</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset_name</strong> – a dataset name</p></li>
|
||||
<li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quapy_data/ directory)</p></li>
|
||||
<li><p><strong>test_split</strong> – proportion of documents to be included in the test set. The rest conforms the training set</p></li>
|
||||
<li><p><strong>verbose</strong> – set to True (default is False) to get information (from the UCI ML repository) about the datasets</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCILabelledCollection">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCILabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_UCILabelledCollection" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Loads a UCI collection as an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>, as used in
|
||||
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253516300628">Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
|
||||
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
|
||||
Information Fusion, 34, 87-100.</a>
|
||||
and
|
||||
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253517303652">Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
|
||||
Dynamic ensemble selection for quantification tasks.
|
||||
Information Fusion, 45, 1-15.</a>.
|
||||
The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
|
||||
protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
|
||||
This can be reproduced by using <a class="reference internal" href="#quapy.data.base.Dataset.kFCV" title="quapy.data.base.Dataset.kFCV"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.data.base.Dataset.kFCV()</span></code></a>, e.g.:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
|
||||
<span class="gp">>>> </span><span class="n">collection</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_UCILabelledCollection</span><span class="p">(</span><span class="s2">"yeast"</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="k">for</span> <span class="n">data</span> <span class="ow">in</span> <span class="n">qp</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">Dataset</span><span class="o">.</span><span class="n">kFCV</span><span class="p">(</span><span class="n">collection</span><span class="p">,</span> <span class="n">nfolds</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">nrepeats</span><span class="o">=</span><span class="mi">2</span><span class="p">):</span>
|
||||
<span class="gp">>>> </span> <span class="o">...</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_DATASETS</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset_name</strong> – a dataset name</p></li>
|
||||
<li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)</p></li>
|
||||
<li><p><strong>test_split</strong> – proportion of documents to be included in the test set. The rest conforms the training set</p></li>
|
||||
<li><p><strong>verbose</strong> – set to True (default is False) to get information (from the UCI ML repository) about the datasets</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_reviews">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_reviews</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tfidf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_reviews" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Load a Reviews dataset as a Dataset instance, as used in:
|
||||
Esuli, A., Moreo, A., and Sebastiani, F. “A recurrent neural network for sentiment quantification.”
|
||||
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
|
||||
:param dataset_name: the name of the dataset: valid ones are ‘hp’, ‘kindle’, ‘imdb’
|
||||
:param tfidf: set to True to transform the raw documents into tfidf weighted matrices
|
||||
:param min_df: minimun number of documents that should contain a term in order for the term to be
|
||||
kept (ignored if tfidf==False)
|
||||
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)
|
||||
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||
faster subsequent invokations
|
||||
:return: a Dataset instance</p>
|
||||
<dd><p>Loads a Reviews dataset as a Dataset instance, as used in
|
||||
<a class="reference external" href="https://dl.acm.org/doi/abs/10.1145/3269206.3269287">Esuli, A., Moreo, A., and Sebastiani, F. “A recurrent neural network for sentiment quantification.”
|
||||
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.</a>.
|
||||
The list of valid dataset names can be accessed in <cite>quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset_name</strong> – the name of the dataset: valid ones are ‘hp’, ‘kindle’, ‘imdb’</p></li>
|
||||
<li><p><strong>tfidf</strong> – set to True to transform the raw documents into tfidf weighted matrices</p></li>
|
||||
<li><p><strong>min_df</strong> – minimun number of documents that should contain a term in order for the term to be
|
||||
kept (ignored if tfidf==False)</p></li>
|
||||
<li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)</p></li>
|
||||
<li><p><strong>pickle</strong> – set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||
faster subsequent invokations</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_twitter">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_twitter</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">for_model_selection</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_twitter" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Load a Twitter dataset as a Dataset instance, as used in:
|
||||
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
Social Network Analysis and Mining6(19), 1–22 (2016)
|
||||
The datasets ‘semeval13’, ‘semeval14’, ‘semeval15’ share the same training set.</p>
|
||||
<dd><p>Loads a Twitter dataset as a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance, as used in:
|
||||
<a class="reference external" href="https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf">Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
Social Network Analysis and Mining6(19), 1–22 (2016)</a>
|
||||
Note that the datasets ‘semeval13’, ‘semeval14’, ‘semeval15’ share the same training set.
|
||||
The list of valid dataset names corresponding to training sets can be accessed in
|
||||
<cite>quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN</cite>, while the test sets can be accessed in
|
||||
<cite>quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>dataset_name</strong> – the name of the dataset: valid ones are ‘gasp’, ‘hcr’, ‘omd’, ‘sanders’, ‘semeval13’,</p>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset_name</strong> – the name of the dataset: valid ones are ‘gasp’, ‘hcr’, ‘omd’, ‘sanders’, ‘semeval13’,
|
||||
‘semeval14’, ‘semeval15’, ‘semeval16’, ‘sst’, ‘wa’, ‘wb’</p></li>
|
||||
<li><p><strong>for_model_selection</strong> – if True, then returns the train split as the training set and the devel split
|
||||
as the test set; if False, then returns the train+devel split as the training set and the test set as the
|
||||
test set</p></li>
|
||||
<li><p><strong>min_df</strong> – minimun number of documents that should contain a term in order for the term to be kept</p></li>
|
||||
<li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)</p></li>
|
||||
<li><p><strong>pickle</strong> – set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||
faster subsequent invokations</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<p>‘semeval14’, ‘semeval15’, ‘semeval16’, ‘sst’, ‘wa’, ‘wb’
|
||||
:param for_model_selection: if True, then returns the train split as the training set and the devel split
|
||||
as the test set; if False, then returns the train+devel split as the training set and the test set as the
|
||||
test set
|
||||
:param min_df: minimun number of documents that should contain a term in order for the term to be kept
|
||||
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)
|
||||
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||
faster subsequent invokations
|
||||
:return: a Dataset instance</p>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
|
@ -278,15 +696,41 @@ faster subsequent invokations
|
|||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer">
|
||||
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">IndexTransformer</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||||
<p>This class implements a sklearn’s-style transformer that indexes text as numerical ids for the tokens it
|
||||
contains, and that would be generated by sklearn’s
|
||||
<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>kwargs</strong> – <p>keyworded arguments from <a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a></p>
|
||||
</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.add_word">
|
||||
<span class="sig-name descname"><span class="pre">add_word</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">word</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nogaps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.add_word" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Adds a new token (regardless of whether it has been found in the text or not), with dedicated id.
|
||||
Useful to define special tokens for codifying unknown words, or padding tokens.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>word</strong> – string, surface form of the token</p></li>
|
||||
<li><p><strong>id</strong> – integer, numerical value to assign to the token (leave as None for indicating the next valid id,
|
||||
default)</p></li>
|
||||
<li><p><strong>nogaps</strong> – if set to True (default) asserts that the id indicated leads to no numerical gaps with
|
||||
precedent ids stored so far</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>integer, the numerical id for the new token</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit">
|
||||
<span class="sig-name descname"><span class="pre">fit</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><dl class="field-list simple">
|
||||
<dd><p>Fits the transformer, i.e., decides on the vocabulary, given a list of strings.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>X</strong> – a list of strings</p>
|
||||
</dd>
|
||||
|
@ -299,66 +743,139 @@ faster subsequent invokations
|
|||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit_transform">
|
||||
<span class="sig-name descname"><span class="pre">fit_transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-</span> <span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit_transform" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.index">
|
||||
<span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">documents</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Fits the transform on <cite>X</cite> and transforms it.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>X</strong> – a list of strings</p></li>
|
||||
<li><p><strong>n_jobs</strong> – the number of parallel workers to carry out this task</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <cite>np.ndarray</cite> of numerical ids</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.transform">
|
||||
<span class="sig-name descname"><span class="pre">transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-</span> <span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.transform" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Transforms the strings in <cite>X</cite> as lists of numerical ids</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>X</strong> – a list of strings</p></li>
|
||||
<li><p><strong>n_jobs</strong> – the number of parallel workers to carry out this task</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <cite>np.ndarray</cite> of numerical ids</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.vocabulary_size">
|
||||
<span class="sig-name descname"><span class="pre">vocabulary_size</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.vocabulary_size" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Gets the length of the vocabulary according to which the document tokens have been indexed</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>integer</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.index">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index.
|
||||
Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK
|
||||
:param dataset: a Dataset where the instances are lists of str
|
||||
:param min_df: minimum number of instances below which the term is replaced by a UNK index
|
||||
:param inplace: whether or not to apply the transformation inplace, or to a new copy
|
||||
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer)
|
||||
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||
consisting of lists of integer values representing indices.</p>
|
||||
<dd><p>Indexes the tokens of a textual <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of string documents.
|
||||
To index a document means to replace each different token by a unique numerical index.
|
||||
Rare words (i.e., words occurring less than <cite>min_df</cite> times) are replaced by a special token <cite>UNK</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> object where the instances of training and test documents
|
||||
are lists of str</p></li>
|
||||
<li><p><strong>min_df</strong> – minimum number of occurrences below which the term is replaced by a <cite>UNK</cite> index</p></li>
|
||||
<li><p><strong>inplace</strong> – whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
|
||||
<li><p><strong>kwargs</strong> – the rest of parameters of the transformation (as for sklearn’s</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
<p><cite>CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>_</cite>)
|
||||
:return: a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (if inplace=False) or a reference to the current</p>
|
||||
<blockquote>
|
||||
<div><p><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (inplace=True) consisting of lists of integer values representing indices.</p>
|
||||
</div></blockquote>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.reduce_columns">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">reduce_columns</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.reduce_columns" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least
|
||||
_min_df_ instances
|
||||
:param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix)
|
||||
:param min_df: minimum number of instances below which the columns are removed
|
||||
:param inplace: whether or not to apply the transformation inplace, or to a new copy
|
||||
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||
where the dimensions corresponding to infrequent instances have been removed</p>
|
||||
<dd><p>Reduces the dimensionality of the instances, represented as a <cite>csr_matrix</cite> (or any subtype of
|
||||
<cite>scipy.sparse.spmatrix</cite>), of training and test documents by removing the columns of words which are not present
|
||||
in at least <cite>min_df</cite> instances in the training set</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> in which instances are represented in sparse format (any
|
||||
subtype of scipy.sparse.spmatrix)</p></li>
|
||||
<li><p><strong>min_df</strong> – integer, minimum number of instances below which the columns are removed</p></li>
|
||||
<li><p><strong>inplace</strong> – whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (if inplace=False) or a reference to the current
|
||||
<a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (inplace=True) where the dimensions corresponding to infrequent terms
|
||||
in the training set have been removed</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.standardize">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">standardize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.standardize" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">standardize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.standardize" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Standardizes the real-valued columns of a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>.
|
||||
Standardization, aka z-scoring, of a variable <cite>X</cite> comes down to subtracting the average and normalizing by the
|
||||
standard deviation.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> object</p></li>
|
||||
<li><p><strong>inplace</strong> – set to True if the transformation is to be applied inplace, or to False (default) if a new
|
||||
<a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> is to be returned</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.text2tfidf">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">text2tfidf</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sublinear_tf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.text2tfidf" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors
|
||||
:param dataset: a Dataset where the instances are lists of str
|
||||
:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary
|
||||
:param sublinear_tf: whether or not to apply the log scalling to the tf counters
|
||||
:param inplace: whether or not to apply the transformation inplace, or to a new copy
|
||||
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer)
|
||||
:return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||
where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
|
||||
<dd><p>Transforms a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of textual instances into a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of
|
||||
tfidf weighted sparse vectors</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> where the instances of training and test collections are
|
||||
lists of str</p></li>
|
||||
<li><p><strong>min_df</strong> – minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)</p></li>
|
||||
<li><p><strong>sublinear_tf</strong> – whether or not to apply the log scalling to the tf counters (default True)</p></li>
|
||||
<li><p><strong>inplace</strong> – whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
|
||||
<li><p><strong>kwargs</strong> – the rest of parameters of the transformation (as for sklearn’s
|
||||
<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html">TfidfVectorizer</a>)</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> in <cite>csr_matrix</cite> format (if inplace=False) or a reference to the
|
||||
current Dataset (if inplace=True) where the instances are stored in a <cite>csr_matrix</cite> of real-valued tfidf scores</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</section>
|
||||
|
@ -367,7 +884,24 @@ where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
|
|||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.reader.binarize">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">binarize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pos_class</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.reader.binarize" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Binarizes a categorical array-like collection of labels towards the positive class <cite>pos_class</cite>. E.g.,:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">binarize</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="n">array</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">])</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>y</strong> – array-like of labels</p></li>
|
||||
<li><p><strong>pos_class</strong> – integer, the positive class</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a binary np.ndarray, in which values 1 corresponds to positions in whcih <cite>y</cite> had <cite>pos_class</cite> labels, and
|
||||
0 otherwise</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.reader.from_csv">
|
||||
|
@ -376,10 +910,13 @@ where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
|
|||
File format <label>,<feat1>,<feat2>,…,<featn></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>path</strong> – path to the csv file</p>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>path</strong> – path to the csv file</p></li>
|
||||
<li><p><strong>encoding</strong> – the text encoding used to open the file</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a ndarray for the labels and a ndarray (float) for the covariates</p>
|
||||
<dd class="field-even"><p>a np.ndarray for the labels and a ndarray (float) for the covariates</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
@ -394,7 +931,7 @@ File format <-1 or 0 or 1>[s col(int):val(float)]</p>
|
|||
<dd class="field-odd"><p><strong>path</strong> – path to the labelled collection</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a csr_matrix containing the instances (rows), and a ndarray containing the labels</p>
|
||||
<dd class="field-even"><p>a <cite>csr_matrix</cite> containing the instances (rows), and a ndarray containing the labels</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
@ -406,7 +943,11 @@ File format <-1 or 0 or 1>[s col(int):val(float)]</p>
|
|||
File fomart <0 or 1> <document></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>path</strong> – path to the labelled collection</p>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>path</strong> – path to the labelled collection</p></li>
|
||||
<li><p><strong>encoding</strong> – the text encoding used to open the file</p></li>
|
||||
<li><p><strong>verbose</strong> – if >0 (default) shows some progress information in standard output</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a list of sentences, and a list of labels</p>
|
||||
|
@ -418,9 +959,19 @@ File fomart <0 or 1> <document></p>
|
|||
<dt class="sig sig-object py" id="quapy.data.reader.reindex_labels">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">reindex_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.reader.reindex_labels" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
|
||||
E.g., y=[‘B’, ‘B’, ‘A’, ‘C’] -> [1,1,0,2], [‘A’,’B’,’C’]
|
||||
:param y: the list or array of original labels
|
||||
:return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.</p>
|
||||
E.g.:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">reindex_labels</span><span class="p">([</span><span class="s1">'B'</span><span class="p">,</span> <span class="s1">'B'</span><span class="p">,</span> <span class="s1">'A'</span><span class="p">,</span> <span class="s1">'C'</span><span class="p">])</span>
|
||||
<span class="gp">>>> </span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">]),</span> <span class="n">array</span><span class="p">([</span><span class="s1">'A'</span><span class="p">,</span> <span class="s1">'B'</span><span class="p">,</span> <span class="s1">'C'</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="s1">'<U1'</span><span class="p">))</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>y</strong> – the list or array of original labels</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</section>
|
||||
|
|
|
@ -515,14 +515,20 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
|
|||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.evaluation.artificial_prevalence_prediction">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.evaluation.</span></span><span class="sig-name descname"><span class="pre">artificial_prevalence_prediction</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="quapy.method.html#quapy.method.base.BaseQuantifier" title="quapy.method.base.BaseQuantifier"><span class="pre">quapy.method.base.BaseQuantifier</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="quapy.data.html#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevpoints</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">210</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_repetitions</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eval_budget</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span 
class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">42</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.evaluation.artificial_prevalence_prediction" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Performs the predictions for all samples generated according to the artificial sampling protocol.
|
||||
:param model: the model in charge of generating the class prevalence estimations
|
||||
:param test: the test set on which to perform arificial sampling
|
||||
:param sample_size: the size of the samples
|
||||
:param n_prevpoints: the number of different prevalences to sample (or set to None if eval_budget is specified)
|
||||
:param n_repetitions: the number of repetitions for each prevalence
|
||||
:param eval_budget: if specified, sets a ceil on the number of evaluations to perform. For example, if there are 3
|
||||
classes, n_repetitions=1 and eval_budget=20, then n_prevpoints will be set to 5, since this will generate 15
|
||||
<dd><p>Performs the predictions for all samples generated according to the artificial sampling protocol.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>model</strong> – the model in charge of generating the class prevalence estimations</p></li>
|
||||
<li><p><strong>test</strong> – the test set on which to perform arificial sampling</p></li>
|
||||
<li><p><strong>sample_size</strong> – the size of the samples</p></li>
|
||||
<li><p><strong>n_prevpoints</strong> – the number of different prevalences to sample (or set to None if eval_budget is specified)</p></li>
|
||||
<li><p><strong>n_repetitions</strong> – the number of repetitions for each prevalence</p></li>
|
||||
<li><p><strong>eval_budget</strong> – if specified, sets a ceil on the number of evaluations to perform. For example, if there are 3</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
<p>classes, n_repetitions=1 and eval_budget=20, then n_prevpoints will be set to 5, since this will generate 15
|
||||
different prevalences ([0, 0, 1], [0, 0.25, 0.75], [0, 0.5, 0.5] … [1, 0, 0]) and since setting it n_prevpoints
|
||||
to 6 would produce more than 20 evaluations.
|
||||
:param n_jobs: number of jobs to be run in parallel
|
||||
|
@ -601,7 +607,31 @@ contains the the prevalence estimations</p>
|
|||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.functional.artificial_prevalence_sampling">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">artificial_prevalence_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dimensions</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeat</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_constrained_dim</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.artificial_prevalence_sampling" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Generates vectors of prevalence values artificially drawn from an exhaustive grid of prevalence values. The
|
||||
number of prevalence values explored for each dimension depends on <cite>n_prevalences</cite>, so that, if, for example,
|
||||
<cite>n_prevalences=11</cite> then the prevalence values of the grid are taken from [0, 0.1, 0.2, …, 0.9, 1]. Only
|
||||
valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
|
||||
valid vector of prevalence values, <cite>repeat</cite> copies are returned. The vector of prevalence values can be
|
||||
implicit (by setting <cite>return_constrained_dim=False</cite>), meaning that the last dimension (which is constrained
|
||||
to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to 1).</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dimensions</strong> – the number of classes</p></li>
|
||||
<li><p><strong>n_prevalences</strong> – the number of equidistant prevalence points to extract from the [0,1] interval for the grid
|
||||
(default is 21)</p></li>
|
||||
<li><p><strong>repeat</strong> – number of copies for each valid prevalence vector (default is 1)</p></li>
|
||||
<li><p><strong>return_constrained_dim</strong> – set to True to return all dimensions, or to False (default) for ommitting the
|
||||
constrained dimension</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an ndarray of shape <cite>(n, dimensions)</cite> if <cite>return_constrained_dim=True</cite> or of shape <cite>(n, dimensions-1)</cite>
|
||||
if <cite>return_constrained_dim=False</cite>, where <cite>n</cite> is the number of valid combinations found in the grid multiplied
|
||||
by <cite>repeat</cite></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.functional.get_nprevpoints_approximation">
|
||||
|
@ -634,8 +664,21 @@ number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0
|
|||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.functional.prevalence_from_labels">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_from_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes_</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_from_labels" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_from_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_from_labels" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Computed the prevalence values from a vector of labels.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>labels</strong> – array-like of shape <cite>(n_instances)</cite> with the label for each instance</p></li>
|
||||
<li><p><strong>classes</strong> – the class labels. This is needed in order to correctly compute the prevalence vector even when
|
||||
some classes have no examples.</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an ndarray of shape <cite>(len(classes))</cite> with the class prevalence values</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.functional.prevalence_from_probabilities">
|
||||
|
@ -645,13 +688,21 @@ number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0
|
|||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.functional.prevalence_linspace">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_linspace</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeat</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">smooth_limits_epsilon</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.01</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_linspace" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Produces a uniformly separated values of prevalence. By default, produces an array 21 prevalences, with step 0.05
|
||||
and with the limits smoothed, i.e.:
|
||||
[0.01, 0.05, 0.10, 0.15, …, 0.90, 0.95, 0.99]
|
||||
:param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
|
||||
:param repeat: number of times each prevalence is to be repeated (defaults to 1)
|
||||
:param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
|
||||
:return: an array of uniformly separated prevalence values</p>
|
||||
<dd><p>Produces a uniformly separated values of prevalence. By default, produces an array of 21 prevalence values, with
|
||||
step 0.05 and with the limits smoothed, i.e.:
|
||||
[0.01, 0.05, 0.10, 0.15, …, 0.90, 0.95, 0.99]</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>n_prevalences</strong> – the number of prevalence values to sample from the [0,1] interval (default 21)</p></li>
|
||||
<li><p><strong>repeat</strong> – number of times each prevalence is to be repeated (defaults to 1)</p></li>
|
||||
<li><p><strong>smooth_limits_epsilon</strong> – the quantity to add and subtract to the limits 0 and 1</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an array of uniformly separated prevalence values</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,6 +1,3 @@
|
|||
from abc import abstractmethod
|
||||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
from scipy.sparse import vstack
|
||||
|
@ -9,18 +6,19 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
|
|||
from quapy.functional import artificial_prevalence_sampling, strprev
|
||||
|
||||
|
||||
|
||||
class LabelledCollection:
|
||||
'''
|
||||
A LabelledCollection is a set of objects each with a label associated to it.
|
||||
'''
|
||||
"""
|
||||
A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling
|
||||
routines.
|
||||
|
||||
:param instances: array-like (np.ndarray, list, or csr_matrix are supported)
|
||||
:param labels: array-like with the same length of instances
|
||||
:param classes_: optional, list of classes from which labels are taken. If not specified, the classes are inferred
|
||||
from the labels. The classes must be indicated in cases in which some of the labels might have no examples
|
||||
(i.e., a prevalence of 0)
|
||||
"""
|
||||
|
||||
def __init__(self, instances, labels, classes_=None):
|
||||
"""
|
||||
:param instances: list of objects
|
||||
:param labels: list of labels, same length of instances
|
||||
:param classes_: optional, list of classes from which labels are taken. When used, must contain the set of values used in labels.
|
||||
"""
|
||||
if issparse(instances):
|
||||
self.instances = instances
|
||||
elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):
|
||||
|
@ -42,28 +40,81 @@ class LabelledCollection:
|
|||
|
||||
@classmethod
|
||||
def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):
|
||||
"""
|
||||
Loads a labelled set of data and convert it into a :class:`LabelledCollection` instance. The function in charge
|
||||
of reading the instances must be specified. This function can be a custom one, or any of the reading functions
|
||||
defined in :mod:`quapy.data.reader` module.
|
||||
|
||||
:param path: string, the path to the file containing the labelled instances
|
||||
:param loader_func: a custom function that implements the data loader and returns a tuple with instances and
|
||||
labels
|
||||
:param classes: array-like, the classes according to which the instances are labelled
|
||||
:param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances, i.e.,
|
||||
these arguments are used to call `loader_func(path, **loader_kwargs)`
|
||||
:return: a :class:`LabelledCollection` object
|
||||
"""
|
||||
return LabelledCollection(*loader_func(path, **loader_kwargs), classes)
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
Returns the length of this collection (number of labelled instances)
|
||||
|
||||
:return: integer
|
||||
"""
|
||||
return self.instances.shape[0]
|
||||
|
||||
def prevalence(self):
|
||||
"""
|
||||
Returns the prevalence, or relative frequency, of the classes of interest.
|
||||
|
||||
:return: a np.ndarray of shape `(n_classes)` with the relative frequencies of each class, in the same order
|
||||
as listed by `self.classes_`
|
||||
"""
|
||||
return self.counts() / len(self)
|
||||
|
||||
def counts(self):
|
||||
"""
|
||||
Returns the number of instances for each of the classes of interest.
|
||||
|
||||
:return: a np.ndarray of shape `(n_classes)` with the number of instances of each class, in the same order
|
||||
as listed by `self.classes_`
|
||||
"""
|
||||
return np.asarray([len(self.index[class_]) for class_ in self.classes_])
|
||||
|
||||
@property
|
||||
def n_classes(self):
|
||||
"""
|
||||
The number of classes
|
||||
|
||||
:return: integer
|
||||
"""
|
||||
return len(self.classes_)
|
||||
|
||||
@property
|
||||
def binary(self):
|
||||
"""
|
||||
Returns True if the number of classes is 2
|
||||
|
||||
:return: boolean
|
||||
"""
|
||||
return self.n_classes == 2
|
||||
|
||||
def sampling_index(self, size, *prevs, shuffle=True):
|
||||
"""
|
||||
Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
|
||||
prevalence values are not specified, then returns the index of a uniform sampling.
|
||||
For each class, the sampling is drawn without replacement if the requested prevalence is larger than
|
||||
the actual prevalence of the class, or with replacement otherwise.
|
||||
|
||||
:param size: integer, the requested size
|
||||
:param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since
|
||||
it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
|
||||
`self.classes_` can be specified, while the other class takes prevalence value `1-p`
|
||||
:param shuffle: if set to True (default), shuffles the index before returning it
|
||||
:return: a np.ndarray of shape `(size)` with the indexes
|
||||
"""
|
||||
if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling
|
||||
return np.random.choice(len(self), size, replace=False)
|
||||
return self.uniform_sampling_index(size)
|
||||
if len(prevs) == self.n_classes - 1:
|
||||
prevs = prevs + (1 - sum(prevs),)
|
||||
assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
|
||||
|
@ -93,47 +144,142 @@ class LabelledCollection:
|
|||
return indexes_sample
|
||||
|
||||
def uniform_sampling_index(self, size):
|
||||
"""
|
||||
Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
|
||||
without replacement if the requested size is greater than the number of instances, or with replacement
|
||||
otherwise.
|
||||
|
||||
:param size: integer, the size of the uniform sample
|
||||
:return: a np.ndarray of shape `(size)` with the indexes
|
||||
"""
|
||||
return np.random.choice(len(self), size, replace=False)
|
||||
|
||||
def uniform_sampling(self, size):
|
||||
unif_index = self.uniform_sampling_index(size)
|
||||
return self.sampling_from_index(unif_index)
|
||||
|
||||
def sampling(self, size, *prevs, shuffle=True):
|
||||
"""
|
||||
Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence
|
||||
values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
|
||||
the actual prevalence of the class, or with replacement otherwise.
|
||||
|
||||
:param size: integer, the requested size
|
||||
:param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since
|
||||
it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
|
||||
`self.classes_` can be specified, while the other class takes prevalence value `1-p`
|
||||
:param shuffle: if set to True (default), shuffles the index before returning it
|
||||
:return: an instance of :class:`LabelledCollection` with length == `size` and prevalence close to `prevs` (or
|
||||
prevalence == `prevs` if the exact prevalence values can be met as proportions of instances)
|
||||
"""
|
||||
prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
|
||||
return self.sampling_from_index(prev_index)
|
||||
|
||||
def uniform_sampling(self, size):
|
||||
"""
|
||||
Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
|
||||
without replacement if the requested size is greater than the number of instances, or with replacement
|
||||
otherwise.
|
||||
|
||||
:param size: integer, the requested size
|
||||
:return: an instance of :class:`LabelledCollection` with length == `size`
|
||||
"""
|
||||
unif_index = self.uniform_sampling_index(size)
|
||||
return self.sampling_from_index(unif_index)
|
||||
|
||||
def sampling_from_index(self, index):
|
||||
"""
|
||||
Returns an instance of :class:`LabelledCollection` whose elements are sampled from this collection using the
|
||||
index.
|
||||
|
||||
:param index: np.ndarray
|
||||
:return: an instance of :class:`LabelledCollection`
|
||||
"""
|
||||
documents = self.instances[index]
|
||||
labels = self.labels[index]
|
||||
return LabelledCollection(documents, labels, classes_=self.classes_)
|
||||
|
||||
def split_stratified(self, train_prop=0.6, random_state=None):
|
||||
# with temp_seed(42):
|
||||
"""
|
||||
Returns two instances of :class:`LabelledCollection` split with stratification from this collection, at desired
|
||||
proportion.
|
||||
|
||||
:param train_prop: the proportion of elements to include in the left-most returned collection (typically used
|
||||
as the training collection). The rest of elements are included in the right-most returned collection
|
||||
(typically used as a test collection).
|
||||
:param random_state: if specified, guarantees reproducibility of the split.
|
||||
:return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the
|
||||
second one with `1-train_prop` elements
|
||||
"""
|
||||
tr_docs, te_docs, tr_labels, te_labels = \
|
||||
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
|
||||
random_state=random_state)
|
||||
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
|
||||
|
||||
def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
|
||||
"""
|
||||
A generator of samples that implements the artificial prevalence protocol (APP). The APP consists of exploring
|
||||
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, ..., 1]), and generating all valid combinations of
|
||||
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
|
||||
[1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
|
||||
combination of prevalence values is indicated by `repeats`
|
||||
|
||||
:param sample_size: the number of instances in each sample
|
||||
:param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
|
||||
limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
|
||||
:param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
|
||||
:return: yield samples generated at artificially controlled prevalence values
|
||||
"""
|
||||
dimensions = self.n_classes
|
||||
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
|
||||
yield self.sampling(sample_size, *prevs)
|
||||
|
||||
def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
|
||||
"""
|
||||
A generator of sample indexes implementing the artificial prevalence protocol (APP).
|
||||
The APP consists of exploring
|
||||
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, ..., 1]), and generating all valid combinations of
|
||||
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
|
||||
[1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of sample indexes for each valid
|
||||
combination of prevalence values is indicated by `repeats`
|
||||
|
||||
:param sample_size: the number of instances in each sample (i.e., length of each index)
|
||||
:param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
|
||||
limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
|
||||
:param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
|
||||
:return: yield the indexes that generate the samples according to APP
|
||||
"""
|
||||
dimensions = self.n_classes
|
||||
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
|
||||
yield self.sampling_index(sample_size, *prevs)
|
||||
|
||||
def natural_sampling_generator(self, sample_size, repeats=100):
|
||||
"""
|
||||
A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
|
||||
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
|
||||
|
||||
:param sample_size: integer, the number of instances in each sample
|
||||
:param repeats: the number of samples to generate
|
||||
:return: yield instances of :class:`LabelledCollection`
|
||||
"""
|
||||
for _ in range(repeats):
|
||||
yield self.uniform_sampling(sample_size)
|
||||
|
||||
def natural_sampling_index_generator(self, sample_size, repeats=100):
|
||||
"""
|
||||
A generator of sample indexes according to the natural prevalence protocol (NPP). The NPP consists of drawing
|
||||
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
|
||||
|
||||
:param sample_size: integer, the number of instances in each sample (i.e., the length of each index)
|
||||
:param repeats: the number of indexes to generate
|
||||
:return: yield `repeats` instances of np.ndarray with shape `(sample_size,)`
|
||||
"""
|
||||
for _ in range(repeats):
|
||||
yield self.uniform_sampling_index(sample_size)
|
||||
|
||||
def __add__(self, other):
|
||||
"""
|
||||
Returns a new :class:`LabelledCollection` as the union of this collection with another collection
|
||||
|
||||
:param other: another :class:`LabelledCollection`
|
||||
:return: a :class:`LabelledCollection` representing the union of both collections
|
||||
"""
|
||||
if other is None:
|
||||
return self
|
||||
elif issparse(self.instances) and issparse(other.instances):
|
||||
|
@ -149,9 +295,29 @@ class LabelledCollection:
|
|||
|
||||
@property
|
||||
def Xy(self):
|
||||
"""
|
||||
Gets the instances and labels. This is useful when working with `sklearn` estimators, e.g.:
|
||||
|
||||
>>> svm = LinearSVC().fit(*my_collection.Xy)
|
||||
|
||||
:return: a tuple `(instances, labels)` from this collection
|
||||
"""
|
||||
return self.instances, self.labels
|
||||
|
||||
def stats(self, show=True):
|
||||
"""
|
||||
Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,:
|
||||
|
||||
>>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
|
||||
>>> data.training.stats()
|
||||
>>> #instances=3821, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]
|
||||
|
||||
:param show: if set to True (default), prints the stats in standard output
|
||||
:return: a dictionary containing some stats of this collection. Keys include `#instances` (the number of
|
||||
instances), `type` (the type representing the instances), `#features` (the number of features, if the
|
||||
instances are in array-like format), `#classes` (the classes of the collection), `prevs` (the prevalence
|
||||
values for each class)
|
||||
"""
|
||||
ninstances = len(self)
|
||||
instance_type = type(self.instances[0])
|
||||
if instance_type == list:
|
||||
|
@ -171,6 +337,14 @@ class LabelledCollection:
|
|||
return stats_
|
||||
|
||||
def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
|
||||
"""
|
||||
Generator of stratified folds to be used in k-fold cross validation.
|
||||
|
||||
:param nfolds: integer (default 5), the number of folds to generate
|
||||
:param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run
|
||||
:param random_state: integer (default 0), guarantees that the folds generated are reproducible
|
||||
:return: yields `nfolds * nrepeats` folds for k-fold cross validation
|
||||
"""
|
||||
kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
|
||||
for train_index, test_index in kf.split(*self.Xy):
|
||||
train = self.sampling_from_index(train_index)
|
||||
|
@ -178,8 +352,15 @@ class LabelledCollection:
|
|||
yield train, test
|
||||
|
||||
|
||||
|
||||
class Dataset:
|
||||
"""
|
||||
Abstraction of training and test :class:`LabelledCollection` objects.
|
||||
|
||||
:param training: a :class:`LabelledCollection` instance
|
||||
:param test: a :class:`LabelledCollection` instance
|
||||
:param vocabulary: if indicated, is a dictionary of the terms used in this textual dataset
|
||||
:param name: a string representing the name of the dataset
|
||||
"""
|
||||
|
||||
def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
|
||||
assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections'
|
||||
|
@ -190,45 +371,118 @@ class Dataset:
|
|||
|
||||
@classmethod
|
||||
def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
|
||||
"""
|
||||
Generates a :class:`Dataset` from a stratified split of a :class:`LabelledCollection` instance.
|
||||
See :meth:`LabelledCollection.split_stratified`
|
||||
|
||||
:param collection: :class:`LabelledCollection`
|
||||
:param train_size: the proportion of training documents (the rest conforms the test split)
|
||||
:return: an instance of :class:`Dataset`
|
||||
"""
|
||||
return Dataset(*collection.split_stratified(train_prop=train_size))
|
||||
|
||||
@property
|
||||
def classes_(self):
|
||||
"""
|
||||
The classes according to which the training collection is labelled
|
||||
|
||||
:return: The classes according to which the training collection is labelled
|
||||
"""
|
||||
return self.training.classes_
|
||||
|
||||
@property
|
||||
def n_classes(self):
|
||||
"""
|
||||
The number of classes according to which the training collection is labelled
|
||||
|
||||
:return: integer
|
||||
"""
|
||||
return self.training.n_classes
|
||||
|
||||
@property
|
||||
def binary(self):
|
||||
"""
|
||||
Returns True if the training collection is labelled according to two classes
|
||||
|
||||
:return: boolean
|
||||
"""
|
||||
return self.training.binary
|
||||
|
||||
@classmethod
|
||||
def load(cls, train_path, test_path, loader_func: callable):
|
||||
training = LabelledCollection.load(train_path, loader_func)
|
||||
test = LabelledCollection.load(test_path, loader_func)
|
||||
def load(cls, train_path, test_path, loader_func: callable, classes=None, **loader_kwargs):
|
||||
"""
|
||||
Loads a training and a test labelled set of data and convert it into a :class:`Dataset` instance.
|
||||
The function in charge of reading the instances must be specified. This function can be a custom one, or any of
|
||||
the reading functions defined in :mod:`quapy.data.reader` module.
|
||||
|
||||
:param train_path: string, the path to the file containing the training instances
|
||||
:param test_path: string, the path to the file containing the test instances
|
||||
:param loader_func: a custom function that implements the data loader and returns a tuple with instances and
|
||||
labels
|
||||
:param classes: array-like, the classes according to which the instances are labelled
|
||||
:param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances.
|
||||
See :meth:`LabelledCollection.load` for further details.
|
||||
:return: a :class:`Dataset` object
|
||||
"""
|
||||
|
||||
training = LabelledCollection.load(train_path, loader_func, classes, **loader_kwargs)
|
||||
test = LabelledCollection.load(test_path, loader_func, classes, **loader_kwargs)
|
||||
return Dataset(training, test)
|
||||
|
||||
@property
|
||||
def vocabulary_size(self):
|
||||
"""
|
||||
If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary
|
||||
|
||||
:return: integer
|
||||
"""
|
||||
return len(self.vocabulary)
|
||||
|
||||
def stats(self):
|
||||
def stats(self, show):
|
||||
"""
|
||||
Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:
|
||||
|
||||
>>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
|
||||
>>> data.stats()
|
||||
>>> Dataset=kindle #tr-instances=3821, #te-instances=21591, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]
|
||||
|
||||
:param show: if set to True (default), prints the stats in standard output
|
||||
:return: a dictionary containing some stats of this collection for the training and test collections. The keys
|
||||
are `train` and `test`, and point to dedicated dictionaries of stats, for each collection, with keys
|
||||
`#instances` (the number of instances), `type` (the type representing the instances),
|
||||
`#features` (the number of features, if the instances are in array-like format), `#classes` (the classes of
|
||||
the collection), `prevs` (the prevalence values for each class)
|
||||
"""
|
||||
tr_stats = self.training.stats(show=False)
|
||||
te_stats = self.test.stats(show=False)
|
||||
print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
|
||||
f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
|
||||
f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
|
||||
if show:
|
||||
print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
|
||||
f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
|
||||
f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
|
||||
return {'train': tr_stats, 'test': te_stats}
|
||||
|
||||
@classmethod
|
||||
def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):
|
||||
"""
|
||||
Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around
|
||||
:meth:`LabelledCollection.kFCV` that returns :class:`Dataset` instances made of training and test folds.
|
||||
|
||||
:param nfolds: integer (default 5), the number of folds to generate
|
||||
:param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run
|
||||
:param random_state: integer (default 0), guarantees that the folds generated are reproducible
|
||||
:return: yields `nfolds * nrepeats` folds for k-fold cross validation as instances of :class:`Dataset`
|
||||
"""
|
||||
for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)):
|
||||
yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')
|
||||
|
||||
|
||||
def isbinary(data):
|
||||
"""
|
||||
Returns True if `data` is either a binary :class:`Dataset` or a binary :class:`LabelledCollection`
|
||||
|
||||
:param data: a :class:`Dataset` or a :class:`LabelledCollection` object
|
||||
:return: True if labelled according to two classes
|
||||
"""
|
||||
if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
|
||||
return data.binary
|
||||
return False
|
||||
|
|
|
@ -5,9 +5,6 @@ warnings.warn = warn
|
|||
import os
|
||||
import zipfile
|
||||
from os.path import join
|
||||
from urllib.error import HTTPError
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from quapy.data.base import Dataset, LabelledCollection
|
||||
|
@ -49,18 +46,20 @@ UCI_DATASETS = ['acute.a', 'acute.b',
|
|||
|
||||
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
|
||||
"""
|
||||
Load a Reviews dataset as a Dataset instance, as used in:
|
||||
Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
|
||||
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
|
||||
Loads a Reviews dataset as a Dataset instance, as used in
|
||||
`Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
|
||||
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_.
|
||||
The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`
|
||||
|
||||
:param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
|
||||
:param tfidf: set to True to transform the raw documents into tfidf weighted matrices
|
||||
:param min_df: minimun number of documents that should contain a term in order for the term to be
|
||||
kept (ignored if tfidf==False)
|
||||
kept (ignored if tfidf==False)
|
||||
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)
|
||||
~/quay_data/ directory)
|
||||
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||
faster subsequent invokations
|
||||
:return: a Dataset instance
|
||||
faster subsequent invokations
|
||||
:return: a :class:`quapy.data.base.Dataset` instance
|
||||
"""
|
||||
assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
|
||||
f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
|
||||
|
@ -93,22 +92,25 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
|
|||
|
||||
def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
|
||||
"""
|
||||
Load a Twitter dataset as a Dataset instance, as used in:
|
||||
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
Social Network Analysis and Mining6(19), 1–22 (2016)
|
||||
The datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
|
||||
Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:
|
||||
`Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
Social Network Analysis and Mining6(19), 1–22 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_
|
||||
Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
|
||||
The list of valid dataset names corresponding to training sets can be accessed in
|
||||
`quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in
|
||||
`quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`
|
||||
|
||||
:param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
|
||||
'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
|
||||
'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
|
||||
:param for_model_selection: if True, then returns the train split as the training set and the devel split
|
||||
as the test set; if False, then returns the train+devel split as the training set and the test set as the
|
||||
test set
|
||||
as the test set; if False, then returns the train+devel split as the training set and the test set as the
|
||||
test set
|
||||
:param min_df: minimun number of documents that should contain a term in order for the term to be kept
|
||||
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)
|
||||
~/quay_data/ directory)
|
||||
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||
faster subsequent invokations
|
||||
:return: a Dataset instance
|
||||
faster subsequent invokations
|
||||
:return: a :class:`quapy.data.base.Dataset` instance
|
||||
"""
|
||||
assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
|
||||
f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
|
||||
|
@ -163,11 +165,58 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
|
|||
|
||||
|
||||
def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
|
||||
"""
|
||||
Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
|
||||
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
|
||||
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
|
||||
Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
|
||||
and
|
||||
`Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
|
||||
Dynamic ensemble selection for quantification tasks.
|
||||
Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
|
||||
The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
|
||||
information on how to use these collections), and so a train-test split is generated at desired proportion.
|
||||
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
|
||||
|
||||
:param dataset_name: a dataset name
|
||||
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)
|
||||
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
|
||||
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
|
||||
:return: a :class:`quapy.data.base.Dataset` instance
|
||||
"""
|
||||
data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
|
||||
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
|
||||
|
||||
|
||||
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
|
||||
"""
|
||||
Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
|
||||
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
|
||||
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
|
||||
Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
|
||||
and
|
||||
`Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
|
||||
Dynamic ensemble selection for quantification tasks.
|
||||
Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
|
||||
The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
|
||||
protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
|
||||
This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
|
||||
|
||||
>>> import quapy as qp
|
||||
>>> collection = qp.datasets.fetch_UCILabelledCollection("yeast")
|
||||
>>> for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
|
||||
>>> ...
|
||||
|
||||
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
|
||||
|
||||
:param dataset_name: a dataset name
|
||||
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)
|
||||
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
|
||||
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
|
||||
:return: a :class:`quapy.data.base.Dataset` instance
|
||||
"""
|
||||
|
||||
assert dataset_name in UCI_DATASETS, \
|
||||
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
|
||||
|
@ -302,7 +351,7 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
|
|||
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
|
||||
|
||||
df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
|
||||
[df_replace(df, col) for col in range(1, 6)]
|
||||
[_df_replace(df, col) for col in range(1, 6)]
|
||||
X = df.loc[:, 0:5].values
|
||||
if dataset_name == 'acute.a':
|
||||
y = binarize(df[6], pos_class='yes')
|
||||
|
@ -482,5 +531,5 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
|
|||
return data
|
||||
|
||||
|
||||
def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
|
||||
def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
|
||||
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
|
|
@ -12,14 +12,18 @@ from .base import LabelledCollection
|
|||
|
||||
def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
|
||||
"""
|
||||
Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors
|
||||
:param dataset: a Dataset where the instances are lists of str
|
||||
:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary
|
||||
:param sublinear_tf: whether or not to apply the log scalling to the tf counters
|
||||
:param inplace: whether or not to apply the transformation inplace, or to a new copy
|
||||
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer)
|
||||
:return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||
where the instances are stored in a csr_matrix of real-valued tfidf scores
|
||||
Transforms a :class:`quapy.data.base.Dataset` of textual instances into a :class:`quapy.data.base.Dataset` of
|
||||
tfidf weighted sparse vectors
|
||||
|
||||
:param dataset: a :class:`quapy.data.base.Dataset` where the instances of training and test collections are
|
||||
lists of str
|
||||
:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)
|
||||
:param sublinear_tf: whether or not to apply the log scalling to the tf counters (default True)
|
||||
:param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
|
||||
:param kwargs: the rest of parameters of the transformation (as for sklearn's
|
||||
`TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_)
|
||||
:return: a new :class:`quapy.data.base.Dataset` in `csr_matrix` format (if inplace=False) or a reference to the
|
||||
current Dataset (if inplace=True) where the instances are stored in a `csr_matrix` of real-valued tfidf scores
|
||||
"""
|
||||
__check_type(dataset.training.instances, np.ndarray, str)
|
||||
__check_type(dataset.test.instances, np.ndarray, str)
|
||||
|
@ -41,13 +45,17 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
|
|||
|
||||
def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
|
||||
"""
|
||||
Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least
|
||||
_min_df_ instances
|
||||
:param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix)
|
||||
:param min_df: minimum number of instances below which the columns are removed
|
||||
:param inplace: whether or not to apply the transformation inplace, or to a new copy
|
||||
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||
where the dimensions corresponding to infrequent instances have been removed
|
||||
Reduces the dimensionality of the instances, represented as a `csr_matrix` (or any subtype of
|
||||
`scipy.sparse.spmatrix`), of training and test documents by removing the columns of words which are not present
|
||||
in at least `min_df` instances in the training set
|
||||
|
||||
:param dataset: a :class:`quapy.data.base.Dataset` in which instances are represented in sparse format (any
|
||||
subtype of scipy.sparse.spmatrix)
|
||||
:param min_df: integer, minimum number of instances below which the columns are removed
|
||||
:param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
|
||||
:return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current
|
||||
:class:`quapy.data.base.Dataset` (inplace=True) where the dimensions corresponding to infrequent terms
|
||||
in the training set have been removed
|
||||
"""
|
||||
__check_type(dataset.training.instances, spmatrix)
|
||||
__check_type(dataset.test.instances, spmatrix)
|
||||
|
@ -71,7 +79,17 @@ def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
|
|||
return Dataset(training, test)
|
||||
|
||||
|
||||
def standardize(dataset: Dataset, inplace=True):
|
||||
def standardize(dataset: Dataset, inplace=False):
|
||||
"""
|
||||
Standardizes the real-valued columns of a :class:`quapy.data.base.Dataset`.
|
||||
Standardization, aka z-scoring, of a variable `X` comes down to subtracting the average and normalizing by the
|
||||
standard deviation.
|
||||
|
||||
:param dataset: a :class:`quapy.data.base.Dataset` object
|
||||
:param inplace: set to True if the transformation is to be applied inplace, or to False (default) if a new
|
||||
:class:`quapy.data.base.Dataset` is to be returned
|
||||
:return:
|
||||
"""
|
||||
s = StandardScaler(copy=not inplace)
|
||||
training = s.fit_transform(dataset.training.instances)
|
||||
test = s.transform(dataset.test.instances)
|
||||
|
@ -83,14 +101,18 @@ def standardize(dataset: Dataset, inplace=True):
|
|||
|
||||
def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
|
||||
"""
|
||||
Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index.
|
||||
Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK
|
||||
:param dataset: a Dataset where the instances are lists of str
|
||||
:param min_df: minimum number of instances below which the term is replaced by a UNK index
|
||||
:param inplace: whether or not to apply the transformation inplace, or to a new copy
|
||||
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer)
|
||||
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||
consisting of lists of integer values representing indices.
|
||||
Indexes the tokens of a textual :class:`quapy.data.base.Dataset` of string documents.
|
||||
To index a document means to replace each different token by a unique numerical index.
|
||||
Rare words (i.e., words occurring less than `min_df` times) are replaced by a special token `UNK`
|
||||
|
||||
:param dataset: a :class:`quapy.data.base.Dataset` object where the instances of training and test documents
|
||||
are lists of str
|
||||
:param min_df: minimum number of occurrences below which the term is replaced by a `UNK` index
|
||||
:param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
|
||||
:param kwargs: the rest of parameters of the transformation (as for sklearn's
|
||||
`CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>_`)
|
||||
:return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current
|
||||
:class:`quapy.data.base.Dataset` (inplace=True) consisting of lists of integer values representing indices.
|
||||
"""
|
||||
__check_type(dataset.training.instances, np.ndarray, str)
|
||||
__check_type(dataset.test.instances, np.ndarray, str)
|
||||
|
@ -120,17 +142,23 @@ def __check_type(container, container_type=None, element_type=None):
|
|||
|
||||
|
||||
class IndexTransformer:
|
||||
"""
|
||||
This class implements a sklearn's-style transformer that indexes text as numerical ids for the tokens it
|
||||
contains, and that would be generated by sklearn's
|
||||
`CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
|
||||
|
||||
:param kwargs: keyworded arguments from `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""
|
||||
:param kwargs: keyworded arguments from _sklearn.feature_extraction.text.CountVectorizer_
|
||||
"""
|
||||
self.vect = CountVectorizer(**kwargs)
|
||||
self.unk = -1 # a valid index is assigned after fit
|
||||
self.pad = -2 # a valid index is assigned after fit
|
||||
|
||||
def fit(self, X):
|
||||
"""
|
||||
Fits the transformer, i.e., decides on the vocabulary, given a list of strings.
|
||||
|
||||
:param X: a list of strings
|
||||
:return: self
|
||||
"""
|
||||
|
@ -142,22 +170,52 @@ class IndexTransformer:
|
|||
return self
|
||||
|
||||
def transform(self, X, n_jobs=-1):
|
||||
"""
|
||||
Transforms the strings in `X` as lists of numerical ids
|
||||
|
||||
:param X: a list of strings
|
||||
:param n_jobs: the number of parallel workers to carry out this task
|
||||
:return: a `np.ndarray` of numerical ids
|
||||
"""
|
||||
# given the number of tasks and the number of jobs, generates the slices for the parallel processes
|
||||
assert self.unk != -1, 'transform called before fit'
|
||||
indexed = map_parallel(func=self.index, args=X, n_jobs=n_jobs)
|
||||
indexed = map_parallel(func=self._index, args=X, n_jobs=n_jobs)
|
||||
return np.asarray(indexed)
|
||||
|
||||
def index(self, documents):
|
||||
def _index(self, documents):
|
||||
vocab = self.vocabulary_.copy()
|
||||
return [[vocab.prevalence(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
|
||||
|
||||
def fit_transform(self, X, n_jobs=-1):
|
||||
"""
|
||||
Fits the transform on `X` and transforms it.
|
||||
|
||||
:param X: a list of strings
|
||||
:param n_jobs: the number of parallel workers to carry out this task
|
||||
:return: a `np.ndarray` of numerical ids
|
||||
"""
|
||||
return self.fit(X).transform(X, n_jobs=n_jobs)
|
||||
|
||||
def vocabulary_size(self):
|
||||
"""
|
||||
Gets the length of the vocabulary according to which the document tokens have been indexed
|
||||
|
||||
:return: integer
|
||||
"""
|
||||
return len(self.vocabulary_)
|
||||
|
||||
def add_word(self, word, id=None, nogaps=True):
|
||||
"""
|
||||
Adds a new token (regardless of whether it has been found in the text or not), with dedicated id.
|
||||
Useful to define special tokens for codifying unknown words, or padding tokens.
|
||||
|
||||
:param word: string, surface form of the token
|
||||
:param id: integer, numerical value to assign to the token (leave as None for indicating the next valid id,
|
||||
default)
|
||||
:param nogaps: if set to True (default) asserts that the id indicated leads to no numerical gaps with
|
||||
precedent ids stored so far
|
||||
:return: integer, the numerical id for the new token
|
||||
"""
|
||||
if word in self.vocabulary_:
|
||||
raise ValueError(f'word {word} already in dictionary')
|
||||
if id is None:
|
||||
|
|
|
@ -7,7 +7,10 @@ def from_text(path, encoding='utf-8', verbose=1, class2int=True):
|
|||
"""
|
||||
Reads a labelled colletion of documents.
|
||||
File fomart <0 or 1>\t<document>\n
|
||||
|
||||
:param path: path to the labelled collection
|
||||
:param encoding: the text encoding used to open the file
|
||||
:param verbose: if >0 (default) shows some progress information in standard output
|
||||
:return: a list of sentences, and a list of labels
|
||||
"""
|
||||
all_sentences, all_labels = [], []
|
||||
|
@ -35,8 +38,9 @@ def from_sparse(path):
|
|||
"""
|
||||
Reads a labelled collection of real-valued instances expressed in sparse format
|
||||
File format <-1 or 0 or 1>[\s col(int):val(float)]\n
|
||||
|
||||
:param path: path to the labelled collection
|
||||
:return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
|
||||
:return: a `csr_matrix` containing the instances (rows), and a ndarray containing the labels
|
||||
"""
|
||||
|
||||
def split_col_val(col_val):
|
||||
|
@ -68,8 +72,10 @@ def from_csv(path, encoding='utf-8'):
|
|||
"""
|
||||
Reads a csv file in which columns are separated by ','.
|
||||
File format <label>,<feat1>,<feat2>,...,<featn>\n
|
||||
|
||||
:param path: path to the csv file
|
||||
:return: a ndarray for the labels and a ndarray (float) for the covariates
|
||||
:param encoding: the text encoding used to open the file
|
||||
:return: a np.ndarray for the labels and a ndarray (float) for the covariates
|
||||
"""
|
||||
|
||||
X, y = [], []
|
||||
|
@ -85,11 +91,16 @@ def from_csv(path, encoding='utf-8'):
|
|||
def reindex_labels(y):
|
||||
"""
|
||||
Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
|
||||
E.g., y=['B', 'B', 'A', 'C'] -> [1,1,0,2], ['A','B','C']
|
||||
E.g.:
|
||||
|
||||
>>> reindex_labels(['B', 'B', 'A', 'C'])
|
||||
>>> (array([1, 1, 0, 2]), array(['A', 'B', 'C'], dtype='<U1'))
|
||||
|
||||
:param y: the list or array of original labels
|
||||
:return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.
|
||||
"""
|
||||
classnames = sorted(np.unique(y))
|
||||
y = np.asarray(y)
|
||||
classnames = np.asarray(sorted(np.unique(y)))
|
||||
label2index = {label: index for index, label in enumerate(classnames)}
|
||||
indexed = np.empty(y.shape, dtype=np.int)
|
||||
for label in classnames:
|
||||
|
@ -98,6 +109,17 @@ def reindex_labels(y):
|
|||
|
||||
|
||||
def binarize(y, pos_class):
|
||||
"""
|
||||
Binarizes a categorical array-like collection of labels towards the positive class `pos_class`. E.g.,:
|
||||
|
||||
>>> binarize([1, 2, 3, 1, 1, 0], pos_class=2)
|
||||
>>> array([0, 1, 0, 0, 0, 0])
|
||||
|
||||
:param y: array-like of labels
|
||||
:param pos_class: integer, the positive class
|
||||
:return: a binary np.ndarray, in which values 1 corresponds to positions in whcih `y` had `pos_class` labels, and
|
||||
0 otherwise
|
||||
"""
|
||||
y = np.asarray(y)
|
||||
ybin = np.zeros(y.shape, dtype=np.int)
|
||||
ybin[y == pos_class] = 1
|
||||
|
|
Loading…
Reference in New Issue