forked from moreo/QuaPy
updating the documentation
This commit is contained in:
parent
1f591ec105
commit
2bd47f0841
|
@ -255,12 +255,8 @@
|
|||
<li><a href="quapy.method.html#quapy.method.neural.QuaNetModule.device">(quapy.method.neural.QuaNetModule property)</a>
|
||||
</li>
|
||||
</ul></li>
|
||||
<li><a href="quapy.data.html#quapy.data.datasets.df_replace">df_replace() (in module quapy.data.datasets)</a>
|
||||
</li>
|
||||
<li><a href="quapy.classification.html#quapy.classification.neural.TextClassifierNet.dimensions">dimensions() (quapy.classification.neural.TextClassifierNet method)</a>
|
||||
</li>
|
||||
</ul></td>
|
||||
<td style="width: 33%; vertical-align: top;"><ul>
|
||||
<li><a href="quapy.classification.html#quapy.classification.neural.CNNnet.document_embedding">document_embedding() (quapy.classification.neural.CNNnet method)</a>
|
||||
|
||||
<ul>
|
||||
|
@ -269,6 +265,8 @@
|
|||
<li><a href="quapy.classification.html#quapy.classification.neural.TextClassifierNet.document_embedding">(quapy.classification.neural.TextClassifierNet method)</a>
|
||||
</li>
|
||||
</ul></li>
|
||||
</ul></td>
|
||||
<td style="width: 33%; vertical-align: top;"><ul>
|
||||
<li><a href="quapy.html#quapy.util.download_file">download_file() (in module quapy.util)</a>
|
||||
</li>
|
||||
<li><a href="quapy.html#quapy.util.download_file_if_not_exists">download_file_if_not_exists() (in module quapy.util)</a>
|
||||
|
@ -462,19 +460,15 @@
|
|||
<table style="width: 100%" class="indextable genindextable"><tr>
|
||||
<td style="width: 33%; vertical-align: top;"><ul>
|
||||
<li><a href="quapy.data.html#quapy.data.preprocessing.index">index() (in module quapy.data.preprocessing)</a>
|
||||
|
||||
<ul>
|
||||
<li><a href="quapy.data.html#quapy.data.preprocessing.IndexTransformer.index">(quapy.data.preprocessing.IndexTransformer method)</a>
|
||||
</li>
|
||||
</ul></li>
|
||||
<li><a href="quapy.data.html#quapy.data.preprocessing.IndexTransformer">IndexTransformer (class in quapy.data.preprocessing)</a>
|
||||
</li>
|
||||
<li><a href="quapy.method.html#quapy.method.neural.QuaNetModule.init_hidden">init_hidden() (quapy.method.neural.QuaNetModule method)</a>
|
||||
</li>
|
||||
</ul></td>
|
||||
<td style="width: 33%; vertical-align: top;"><ul>
|
||||
<li><a href="quapy.method.html#quapy.method.base.isaggregative">isaggregative() (in module quapy.method.base)</a>
|
||||
</li>
|
||||
</ul></td>
|
||||
<td style="width: 33%; vertical-align: top;"><ul>
|
||||
<li><a href="quapy.html#quapy.isbinary">isbinary() (in module quapy)</a>
|
||||
|
||||
<ul>
|
||||
|
|
Binary file not shown.
|
@ -63,45 +63,144 @@
|
|||
<dt class="sig sig-object py" id="quapy.data.base.Dataset">
|
||||
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">Dataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">training</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">vocabulary</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||||
<p>Abstraction of training and test <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> objects.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>training</strong> – a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance</p></li>
|
||||
<li><p><strong>test</strong> – a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance</p></li>
|
||||
<li><p><strong>vocabulary</strong> – if indicated, is a dictionary of the terms used in this textual dataset</p></li>
|
||||
<li><p><strong>name</strong> – a string representing the name of the dataset</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.SplitStratified">
|
||||
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">SplitStratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">collection</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">train_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.SplitStratified" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Generates a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> from a stratified split of a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance.
|
||||
See <a class="reference internal" href="#quapy.data.base.LabelledCollection.split_stratified" title="quapy.data.base.LabelledCollection.split_stratified"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.split_stratified()</span></code></a></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>collection</strong> – <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p></li>
|
||||
<li><p><strong>train_size</strong> – the proportion of training documents (the rest conforms the test split)</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.binary">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.Dataset.binary" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns True if the training collection is labelled according to two classes</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>boolean</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.classes_">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">classes_</span></span><a class="headerlink" href="#quapy.data.base.Dataset.classes_" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>The classes according to which the training collection is labelled</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>The classes according to which the training collection is labelled</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.kFCV">
|
||||
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.kFCV" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around
|
||||
<a class="reference internal" href="#quapy.data.base.LabelledCollection.kFCV" title="quapy.data.base.LabelledCollection.kFCV"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.kFCV()</span></code></a> that returns <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> instances made of training and test folds.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>nfolds</strong> – integer (default 5), the number of folds to generate</p></li>
|
||||
<li><p><strong>nrepeats</strong> – integer (default 1), the number of rounds of k-fold cross validation to run</p></li>
|
||||
<li><p><strong>random_state</strong> – integer (default 0), guarantees that the folds generated are reproducible</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>yields <cite>nfolds * nrepeats</cite> folds for k-fold cross validation as instances of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.load">
|
||||
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.load" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">loader_kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.load" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Loads a training and a test labelled set of data and convert it into a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> instance.
|
||||
The function in charge of reading the instances must be specified. This function can be a custom one, or any of
|
||||
the reading functions defined in <a class="reference internal" href="#module-quapy.data.reader" title="quapy.data.reader"><code class="xref py py-mod docutils literal notranslate"><span class="pre">quapy.data.reader</span></code></a> module.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>train_path</strong> – string, the path to the file containing the training instances</p></li>
|
||||
<li><p><strong>test_path</strong> – string, the path to the file containing the test instances</p></li>
|
||||
<li><p><strong>loader_func</strong> – a custom function that implements the data loader and returns a tuple with instances and
|
||||
labels</p></li>
|
||||
<li><p><strong>classes</strong> – array-like, the classes according to which the instances are labelled</p></li>
|
||||
<li><p><strong>loader_kwargs</strong> – any argument that the <cite>loader_func</cite> function needs in order to read the instances.
|
||||
See <a class="reference internal" href="#quapy.data.base.LabelledCollection.load" title="quapy.data.base.LabelledCollection.load"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.load()</span></code></a> for further details.</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.n_classes">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.Dataset.n_classes" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>The number of classes according to which the training collection is labelled</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>integer</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.stats">
|
||||
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.stats" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.Dataset.stats" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">'kindle'</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="n">data</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
|
||||
<span class="gp">>>> </span><span class="n">Dataset</span><span class="o">=</span><span class="n">kindle</span> <span class="c1">#tr-instances=3821, #te-instances=21591, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>show</strong> – if set to True (default), prints the stats in standard output</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a dictionary containing some stats of this collection for the training and test collections. The keys
|
||||
are <cite>train</cite> and <cite>test</cite>, and point to dedicated dictionaries of stats, for each collection, with keys
|
||||
<cite>#instances</cite> (the number of instances), <cite>type</cite> (the type representing the instances),
|
||||
<cite>#features</cite> (the number of features, if the instances are in array-like format), <cite>#classes</cite> (the classes of
|
||||
the collection), <cite>prevs</cite> (the prevalence values for each class)</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.Dataset.vocabulary_size">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">vocabulary_size</span></span><a class="headerlink" href="#quapy.data.base.Dataset.vocabulary_size" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>integer</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
|
||||
|
||||
|
@ -109,161 +208,480 @@
|
|||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection">
|
||||
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">LabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">instances</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes_</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||||
<p>A LabelledCollection is a set of objects each with a label associated to it.</p>
|
||||
<p>A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling
|
||||
routines.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>instances</strong> – array-like (np.ndarray, list, or csr_matrix are supported)</p></li>
|
||||
<li><p><strong>labels</strong> – array-like with the same length of instances</p></li>
|
||||
<li><p><strong>classes</strong> – optional, list of classes from which labels are taken. If not specified, the classes are inferred
|
||||
from the labels. The classes must be indicated in cases in which some of the labels might have no examples
|
||||
(i.e., a prevalence of 0)</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.Xy">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">Xy</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.Xy" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Gets the instances and labels. This is useful when working with <cite>sklearn</cite> estimators, e.g.:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">svm</span> <span class="o">=</span> <span class="n">LinearSVC</span><span class="p">()</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="o">*</span><span class="n">my_collection</span><span class="o">.</span><span class="n">Xy</span><span class="p">)</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>a tuple <cite>(instances, labels)</cite> from this collection</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.artificial_sampling_generator">
|
||||
<span class="sig-name descname"><span class="pre">artificial_sampling_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">101</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.artificial_sampling_generator" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>A generator of samples that implements the artificial prevalence protocol (APP). The APP consists of exploring
|
||||
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, …, 1]), and generating all valid combinations of
|
||||
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], …,
|
||||
[1, 0, 0] prevalence values of size <cite>sample_size</cite> will be yielded). The number of samples for each valid
|
||||
combination of prevalence values is indicated by <cite>repeats</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>sample_size</strong> – the number of instances in each sample</p></li>
|
||||
<li><p><strong>n_prevalences</strong> – the number of prevalence points to be taken from the [0,1] interval (including the
|
||||
limits {0,1}). E.g., if <cite>n_prevalences=11</cite>, then the prevalence points to take are [0, 0.1, 0.2, …, 1]</p></li>
|
||||
<li><p><strong>repeats</strong> – the number of samples to generate for each valid combination of prevalence values (default 1)</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>yield samples generated at artificially controlled prevalence values</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.artificial_sampling_index_generator">
|
||||
<span class="sig-name descname"><span class="pre">artificial_sampling_index_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">101</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.artificial_sampling_index_generator" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>A generator of sample indexes implementing the artificial prevalence protocol (APP).
|
||||
The APP consists of exploring
|
||||
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, …, 1]), and generating all valid combinations of
|
||||
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], …,
|
||||
[1, 0, 0] prevalence values of size <cite>sample_size</cite> will be yielded). The number of sample indexes for each valid
|
||||
combination of prevalence values is indicated by <cite>repeats</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>sample_size</strong> – the number of instances in each sample (i.e., length of each index)</p></li>
|
||||
<li><p><strong>n_prevalences</strong> – the number of prevalence points to be taken from the [0,1] interval (including the
|
||||
limits {0,1}). E.g., if <cite>n_prevalences=11</cite>, then the prevalence points to take are [0, 0.1, 0.2, …, 1]</p></li>
|
||||
<li><p><strong>repeats</strong> – the number of samples to generate for each valid combination of prevalence values (default 1)</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>yield the indexes that generate the samples according to APP</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.binary">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.binary" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns True if the number of classes is 2</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>boolean</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.counts">
|
||||
<span class="sig-name descname"><span class="pre">counts</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.counts" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns the number of instances for each of the classes of interest.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>a np.ndarray of shape <cite>(n_classes)</cite> with the number of instances of each class, in the same order
|
||||
as listed by <cite>self.classes_</cite></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.kFCV">
|
||||
<span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.kFCV" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Generator of stratified folds to be used in k-fold cross validation.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>nfolds</strong> – integer (default 5), the number of folds to generate</p></li>
|
||||
<li><p><strong>nrepeats</strong> – integer (default 1), the number of rounds of k-fold cross validation to run</p></li>
|
||||
<li><p><strong>random_state</strong> – integer (default 0), guarantees that the folds generated are reproducible</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>yields <cite>nfolds * nrepeats</cite> folds for k-fold cross validation</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.load">
|
||||
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.load" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<em class="property"><span class="pre">classmethod</span> </em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">loader_kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.load" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Loads a labelled set of data and convert it into a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance. The function in charge
|
||||
of reading the instances must be specified. This function can be a custom one, or any of the reading functions
|
||||
defined in <a class="reference internal" href="#module-quapy.data.reader" title="quapy.data.reader"><code class="xref py py-mod docutils literal notranslate"><span class="pre">quapy.data.reader</span></code></a> module.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>path</strong> – string, the path to the file containing the labelled instances</p></li>
|
||||
<li><p><strong>loader_func</strong> – a custom function that implements the data loader and returns a tuple with instances and
|
||||
labels</p></li>
|
||||
<li><p><strong>classes</strong> – array-like, the classes according to which the instances are labelled</p></li>
|
||||
<li><p><strong>loader_kwargs</strong> – any argument that the <cite>loader_func</cite> function needs in order to read the instances, i.e.,
|
||||
these arguments are used to call <cite>loader_func(path, **loader_kwargs)</cite></p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> object</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.n_classes">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.n_classes" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>The number of classes</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>integer</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.natural_sampling_generator">
|
||||
<span class="sig-name descname"><span class="pre">natural_sampling_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.natural_sampling_generator" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
|
||||
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>sample_size</strong> – integer, the number of instances in each sample</p></li>
|
||||
<li><p><strong>repeats</strong> – the number of samples to generate</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>yield instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.natural_sampling_index_generator">
|
||||
<span class="sig-name descname"><span class="pre">natural_sampling_index_generator</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.natural_sampling_index_generator" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>A generator of sample indexes according to the natural prevalence protocol (NPP). The NPP consists of drawing
|
||||
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>sample_size</strong> – integer, the number of instances in each sample (i.e., the length of each index)</p></li>
|
||||
<li><p><strong>repeats</strong> – the number of indexes to generate</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>yield <cite>repeats</cite> instances of np.ndarray with shape <cite>(sample_size,)</cite></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.prevalence">
|
||||
<span class="sig-name descname"><span class="pre">prevalence</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.prevalence" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns the prevalence, or relative frequency, of the classes of interest.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>a np.ndarray of shape <cite>(n_classes)</cite> with the relative frequencies of each class, in the same order
|
||||
as listed by <cite>self.classes_</cite></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling">
|
||||
<span class="sig-name descname"><span class="pre">sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Return a random sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size and desired prevalence
|
||||
values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
|
||||
the actual prevalence of the class, or with replacement otherwise.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>size</strong> – integer, the requested size</p></li>
|
||||
<li><p><strong>prevs</strong> – the prevalence for each class; the prevalence value for the last class can be lead empty since
|
||||
it is constrained. E.g., for binary collections, only the prevalence <cite>p</cite> for the first class (as listed in
|
||||
<cite>self.classes_</cite> can be specified, while the other class takes prevalence value <cite>1-p</cite></p></li>
|
||||
<li><p><strong>shuffle</strong> – if set to True (default), shuffles the index before returning it</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> with length == <cite>size</cite> and prevalence close to <cite>prevs</cite> (or
|
||||
prevalence == <cite>prevs</cite> if the exact prevalence values can be met as proportions of instances)</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_from_index">
|
||||
<span class="sig-name descname"><span class="pre">sampling_from_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">index</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_from_index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> whose elements are sampled from this collection using the
|
||||
index.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>index</strong> – np.ndarray</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_index">
|
||||
<span class="sig-name descname"><span class="pre">sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
|
||||
prevalence values are not specified, then returns the index of a uniform sampling.
|
||||
For each class, the sampling is drawn without replacement if the requested prevalence is larger than
|
||||
the actual prevalence of the class, or with replacement otherwise.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>size</strong> – integer, the requested size</p></li>
|
||||
<li><p><strong>prevs</strong> – the prevalence for each class; the prevalence value for the last class can be lead empty since
|
||||
it is constrained. E.g., for binary collections, only the prevalence <cite>p</cite> for the first class (as listed in
|
||||
<cite>self.classes_</cite> can be specified, while the other class takes prevalence value <cite>1-p</cite></p></li>
|
||||
<li><p><strong>shuffle</strong> – if set to True (default), shuffles the index before returning it</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a np.ndarray of shape <cite>(size)</cite> with the indexes</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.split_stratified">
|
||||
<span class="sig-name descname"><span class="pre">split_stratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_prop</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.split_stratified" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> split with stratification from this collection, at desired
|
||||
proportion.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>train_prop</strong> – the proportion of elements to include in the left-most returned collection (typically used
|
||||
as the training collection). The rest of elements are included in the right-most returned collection
|
||||
(typically used as a test collection).</p></li>
|
||||
<li><p><strong>random_state</strong> – if specified, guarantees reproducibility of the split.</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>, the first one with <cite>train_prop</cite> elements, and the
|
||||
second one with <cite>1-train_prop</cite> elements</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.stats">
|
||||
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.stats" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">'kindle'</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="n">data</span><span class="o">.</span><span class="n">training</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
|
||||
<span class="gp">>>> </span><span class="c1">#instances=3821, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>show</strong> – if set to True (default), prints the stats in standard output</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a dictionary containing some stats of this collection. Keys include <cite>#instances</cite> (the number of
|
||||
instances), <cite>type</cite> (the type representing the instances), <cite>#features</cite> (the number of features, if the
|
||||
instances are in array-like format), <cite>#classes</cite> (the classes of the collection), <cite>prevs</cite> (the prevalence
|
||||
values for each class)</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling">
|
||||
<span class="sig-name descname"><span class="pre">uniform_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns a uniform sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size. The sampling is drawn
|
||||
without replacement if the requested size is greater than the number of instances, or with replacement
|
||||
otherwise.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>size</strong> – integer, the requested size</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> with length == <cite>size</cite></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling_index">
|
||||
<span class="sig-name descname"><span class="pre">uniform_sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling_index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
|
||||
without replacement if the requested size is greater than the number of instances, or with replacement
|
||||
otherwise.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>size</strong> – integer, the size of the uniform sample</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a np.ndarray of shape <cite>(size)</cite> with the indexes</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.base.isbinary">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">isbinary</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.base.isbinary" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns True if <cite>data</cite> is either a binary <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> or a binary <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>data</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> or a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> object</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>True if labelled according to two classes</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</section>
|
||||
<section id="module-quapy.data.datasets">
|
||||
<span id="quapy-data-datasets-module"></span><h2>quapy.data.datasets module<a class="headerlink" href="#module-quapy.data.datasets" title="Permalink to this headline">¶</a></h2>
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.datasets.df_replace">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">df_replace</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="pre">df</span></em>, <em class="sig-param"><span class="pre">col</span></em>, <em class="sig-param"><span class="pre">repl={'no':</span> <span class="pre">0</span></em>, <em class="sig-param"><span class="pre">'yes':</span> <span class="pre">1}</span></em>, <em class="sig-param"><span class="pre">astype=<class</span> <span class="pre">'float'></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.datasets.df_replace" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCIDataset">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCIDataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_split</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_UCIDataset" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Loads a UCI dataset as an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>, as used in
|
||||
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253516300628">Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
|
||||
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
|
||||
Information Fusion, 34, 87-100.</a>
|
||||
and
|
||||
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253517303652">Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
|
||||
Dynamic ensemble selection for quantification tasks.
|
||||
Information Fusion, 45, 1-15.</a>.
|
||||
The datasets do not come with a predefined train-test split (see <a class="reference internal" href="#quapy.data.datasets.fetch_UCILabelledCollection" title="quapy.data.datasets.fetch_UCILabelledCollection"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fetch_UCILabelledCollection()</span></code></a> for further
|
||||
information on how to use these collections), and so a train-test split is generated at desired proportion.
|
||||
The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_DATASETS</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset_name</strong> – a dataset name</p></li>
|
||||
<li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)</p></li>
|
||||
<li><p><strong>test_split</strong> – proportion of documents to be included in the test set. The rest conforms the training set</p></li>
|
||||
<li><p><strong>verbose</strong> – set to True (default is False) to get information (from the UCI ML repository) about the datasets</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCILabelledCollection">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCILabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_UCILabelledCollection" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Loads a UCI collection as an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>, as used in
|
||||
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253516300628">Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
|
||||
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
|
||||
Information Fusion, 34, 87-100.</a>
|
||||
and
|
||||
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253517303652">Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
|
||||
Dynamic ensemble selection for quantification tasks.
|
||||
Information Fusion, 45, 1-15.</a>.
|
||||
The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
|
||||
protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
|
||||
This can be reproduced by using <a class="reference internal" href="#quapy.data.base.Dataset.kFCV" title="quapy.data.base.Dataset.kFCV"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.data.base.Dataset.kFCV()</span></code></a>, e.g.:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
|
||||
<span class="gp">>>> </span><span class="n">collection</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_UCILabelledCollection</span><span class="p">(</span><span class="s2">"yeast"</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="k">for</span> <span class="n">data</span> <span class="ow">in</span> <span class="n">qp</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">Dataset</span><span class="o">.</span><span class="n">kFCV</span><span class="p">(</span><span class="n">collection</span><span class="p">,</span> <span class="n">nfolds</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">nrepeats</span><span class="o">=</span><span class="mi">2</span><span class="p">):</span>
|
||||
<span class="gp">>>> </span> <span class="o">...</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_DATASETS</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset_name</strong> – a dataset name</p></li>
|
||||
<li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)</p></li>
|
||||
<li><p><strong>test_split</strong> – proportion of documents to be included in the test set. The rest conforms the training set</p></li>
|
||||
<li><p><strong>verbose</strong> – set to True (default is False) to get information (from the UCI ML repository) about the datasets</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_reviews">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_reviews</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tfidf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_reviews" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Load a Reviews dataset as a Dataset instance, as used in:
|
||||
Esuli, A., Moreo, A., and Sebastiani, F. “A recurrent neural network for sentiment quantification.”
|
||||
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
|
||||
:param dataset_name: the name of the dataset: valid ones are ‘hp’, ‘kindle’, ‘imdb’
|
||||
:param tfidf: set to True to transform the raw documents into tfidf weighted matrices
|
||||
:param min_df: minimun number of documents that should contain a term in order for the term to be
|
||||
kept (ignored if tfidf==False)
|
||||
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)
|
||||
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||
faster subsequent invokations
|
||||
:return: a Dataset instance</p>
|
||||
<dd><p>Loads a Reviews dataset as a Dataset instance, as used in
|
||||
<a class="reference external" href="https://dl.acm.org/doi/abs/10.1145/3269206.3269287">Esuli, A., Moreo, A., and Sebastiani, F. “A recurrent neural network for sentiment quantification.”
|
||||
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.</a>.
|
||||
The list of valid dataset names can be accessed in <cite>quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset_name</strong> – the name of the dataset: valid ones are ‘hp’, ‘kindle’, ‘imdb’</p></li>
|
||||
<li><p><strong>tfidf</strong> – set to True to transform the raw documents into tfidf weighted matrices</p></li>
|
||||
<li><p><strong>min_df</strong> – minimun number of documents that should contain a term in order for the term to be
|
||||
kept (ignored if tfidf==False)</p></li>
|
||||
<li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)</p></li>
|
||||
<li><p><strong>pickle</strong> – set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||
faster subsequent invokations</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_twitter">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_twitter</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">for_model_selection</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></span><a class="headerlink" href="#quapy.data.datasets.fetch_twitter" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Load a Twitter dataset as a Dataset instance, as used in:
|
||||
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
Social Network Analysis and Mining6(19), 1–22 (2016)
|
||||
The datasets ‘semeval13’, ‘semeval14’, ‘semeval15’ share the same training set.</p>
|
||||
<dd><p>Loads a Twitter dataset as a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance, as used in:
|
||||
<a class="reference external" href="https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf">Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
Social Network Analysis and Mining6(19), 1–22 (2016)</a>
|
||||
Note that the datasets ‘semeval13’, ‘semeval14’, ‘semeval15’ share the same training set.
|
||||
The list of valid dataset names corresponding to training sets can be accessed in
|
||||
<cite>quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN</cite>, while the test sets can be accessed in
|
||||
<cite>quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>dataset_name</strong> – the name of the dataset: valid ones are ‘gasp’, ‘hcr’, ‘omd’, ‘sanders’, ‘semeval13’,</p>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset_name</strong> – the name of the dataset: valid ones are ‘gasp’, ‘hcr’, ‘omd’, ‘sanders’, ‘semeval13’,
|
||||
‘semeval14’, ‘semeval15’, ‘semeval16’, ‘sst’, ‘wa’, ‘wb’</p></li>
|
||||
<li><p><strong>for_model_selection</strong> – if True, then returns the train split as the training set and the devel split
|
||||
as the test set; if False, then returns the train+devel split as the training set and the test set as the
|
||||
test set</p></li>
|
||||
<li><p><strong>min_df</strong> – minimun number of documents that should contain a term in order for the term to be kept</p></li>
|
||||
<li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)</p></li>
|
||||
<li><p><strong>pickle</strong> – set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||
faster subsequent invokations</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<p>‘semeval14’, ‘semeval15’, ‘semeval16’, ‘sst’, ‘wa’, ‘wb’
|
||||
:param for_model_selection: if True, then returns the train split as the training set and the devel split
|
||||
as the test set; if False, then returns the train+devel split as the training set and the test set as the
|
||||
test set
|
||||
:param min_df: minimun number of documents that should contain a term in order for the term to be kept
|
||||
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)
|
||||
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||
faster subsequent invokations
|
||||
:return: a Dataset instance</p>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
|
@ -278,15 +696,41 @@ faster subsequent invokations
|
|||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer">
|
||||
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">IndexTransformer</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||||
<p>This class implements a sklearn’s-style transformer that indexes text as numerical ids for the tokens it
|
||||
contains, and that would be generated by sklearn’s
|
||||
<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>kwargs</strong> – <p>keyworded arguments from <a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a></p>
|
||||
</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.add_word">
|
||||
<span class="sig-name descname"><span class="pre">add_word</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">word</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nogaps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.add_word" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Adds a new token (regardless of whether it has been found in the text or not), with dedicated id.
|
||||
Useful to define special tokens for codifying unknown words, or padding tokens.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>word</strong> – string, surface form of the token</p></li>
|
||||
<li><p><strong>id</strong> – integer, numerical value to assign to the token (leave as None for indicating the next valid id,
|
||||
default)</p></li>
|
||||
<li><p><strong>nogaps</strong> – if set to True (default) asserts that the id indicated leads to no numerical gaps with
|
||||
precedent ids stored so far</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>integer, the numerical id for the new token</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit">
|
||||
<span class="sig-name descname"><span class="pre">fit</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><dl class="field-list simple">
|
||||
<dd><p>Fits the transformer, i.e., decides on the vocabulary, given a list of strings.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>X</strong> – a list of strings</p>
|
||||
</dd>
|
||||
|
@ -299,66 +743,139 @@ faster subsequent invokations
|
|||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit_transform">
|
||||
<span class="sig-name descname"><span class="pre">fit_transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-</span> <span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit_transform" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.index">
|
||||
<span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">documents</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Fits the transform on <cite>X</cite> and transforms it.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>X</strong> – a list of strings</p></li>
|
||||
<li><p><strong>n_jobs</strong> – the number of parallel workers to carry out this task</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <cite>np.ndarray</cite> of numerical ids</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.transform">
|
||||
<span class="sig-name descname"><span class="pre">transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-</span> <span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.transform" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Transforms the strings in <cite>X</cite> as lists of numerical ids</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>X</strong> – a list of strings</p></li>
|
||||
<li><p><strong>n_jobs</strong> – the number of parallel workers to carry out this task</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a <cite>np.ndarray</cite> of numerical ids</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.vocabulary_size">
|
||||
<span class="sig-name descname"><span class="pre">vocabulary_size</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.vocabulary_size" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Gets the length of the vocabulary according to which the document tokens have been indexed</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>integer</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.index">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.index" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index.
|
||||
Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK
|
||||
:param dataset: a Dataset where the instances are lists of str
|
||||
:param min_df: minimum number of instances below which the term is replaced by a UNK index
|
||||
:param inplace: whether or not to apply the transformation inplace, or to a new copy
|
||||
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer)
|
||||
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||
consisting of lists of integer values representing indices.</p>
|
||||
<dd><p>Indexes the tokens of a textual <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of string documents.
|
||||
To index a document means to replace each different token by a unique numerical index.
|
||||
Rare words (i.e., words occurring less than <cite>min_df</cite> times) are replaced by a special token <cite>UNK</cite></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> object where the instances of training and test documents
|
||||
are lists of str</p></li>
|
||||
<li><p><strong>min_df</strong> – minimum number of occurrences below which the term is replaced by a <cite>UNK</cite> index</p></li>
|
||||
<li><p><strong>inplace</strong> – whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
|
||||
<li><p><strong>kwargs</strong> – the rest of parameters of the transformation (as for sklearn’s</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
<p><cite>CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>_</cite>)
|
||||
:return: a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (if inplace=False) or a reference to the current</p>
|
||||
<blockquote>
|
||||
<div><p><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (inplace=True) consisting of lists of integer values representing indices.</p>
|
||||
</div></blockquote>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.reduce_columns">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">reduce_columns</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.reduce_columns" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least
|
||||
_min_df_ instances
|
||||
:param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix)
|
||||
:param min_df: minimum number of instances below which the columns are removed
|
||||
:param inplace: whether or not to apply the transformation inplace, or to a new copy
|
||||
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||
where the dimensions corresponding to infrequent instances have been removed</p>
|
||||
<dd><p>Reduces the dimensionality of the instances, represented as a <cite>csr_matrix</cite> (or any subtype of
|
||||
<cite>scipy.sparse.spmatrix</cite>), of training and test documents by removing the columns of words which are not present
|
||||
in at least <cite>min_df</cite> instances in the training set</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> in which instances are represented in sparse format (any
|
||||
subtype of scipy.sparse.spmatrix)</p></li>
|
||||
<li><p><strong>min_df</strong> – integer, minimum number of instances below which the columns are removed</p></li>
|
||||
<li><p><strong>inplace</strong> – whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (if inplace=False) or a reference to the current
|
||||
<a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (inplace=True) where the dimensions corresponding to infrequent terms
|
||||
in the training set have been removed</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.standardize">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">standardize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.standardize" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">standardize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.standardize" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Standardizes the real-valued columns of a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>.
|
||||
Standardization, aka z-scoring, of a variable <cite>X</cite> comes down to subtracting the average and normalizing by the
|
||||
standard deviation.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> object</p></li>
|
||||
<li><p><strong>inplace</strong> – set to True if the transformation is to be applied inplace, or to False (default) if a new
|
||||
<a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> is to be returned</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.preprocessing.text2tfidf">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">text2tfidf</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">quapy.data.base.Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sublinear_tf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.preprocessing.text2tfidf" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors
|
||||
:param dataset: a Dataset where the instances are lists of str
|
||||
:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary
|
||||
:param sublinear_tf: whether or not to apply the log scalling to the tf counters
|
||||
:param inplace: whether or not to apply the transformation inplace, or to a new copy
|
||||
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer)
|
||||
:return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||
where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
|
||||
<dd><p>Transforms a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of textual instances into a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of
|
||||
tfidf weighted sparse vectors</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dataset</strong> – a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> where the instances of training and test collections are
|
||||
lists of str</p></li>
|
||||
<li><p><strong>min_df</strong> – minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)</p></li>
|
||||
<li><p><strong>sublinear_tf</strong> – whether or not to apply the log scalling to the tf counters (default True)</p></li>
|
||||
<li><p><strong>inplace</strong> – whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
|
||||
<li><p><strong>kwargs</strong> – the rest of parameters of the transformation (as for sklearn’s
|
||||
<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html">TfidfVectorizer</a>)</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> in <cite>csr_matrix</cite> format (if inplace=False) or a reference to the
|
||||
current Dataset (if inplace=True) where the instances are stored in a <cite>csr_matrix</cite> of real-valued tfidf scores</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</section>
|
||||
|
@ -367,7 +884,24 @@ where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
|
|||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.reader.binarize">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">binarize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pos_class</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.reader.binarize" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Binarizes a categorical array-like collection of labels towards the positive class <cite>pos_class</cite>. E.g.,:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">binarize</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="n">array</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">])</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>y</strong> – array-like of labels</p></li>
|
||||
<li><p><strong>pos_class</strong> – integer, the positive class</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a binary np.ndarray, in which values 1 corresponds to positions in whcih <cite>y</cite> had <cite>pos_class</cite> labels, and
|
||||
0 otherwise</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.data.reader.from_csv">
|
||||
|
@ -376,10 +910,13 @@ where the instances are stored in a csr_matrix of real-valued tfidf scores</p>
|
|||
File format <label>,<feat1>,<feat2>,…,<featn></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>path</strong> – path to the csv file</p>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>path</strong> – path to the csv file</p></li>
|
||||
<li><p><strong>encoding</strong> – the text encoding used to open the file</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a ndarray for the labels and a ndarray (float) for the covariates</p>
|
||||
<dd class="field-even"><p>a np.ndarray for the labels and a ndarray (float) for the covariates</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
@ -394,7 +931,7 @@ File format <-1 or 0 or 1>[s col(int):val(float)]</p>
|
|||
<dd class="field-odd"><p><strong>path</strong> – path to the labelled collection</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a csr_matrix containing the instances (rows), and a ndarray containing the labels</p>
|
||||
<dd class="field-even"><p>a <cite>csr_matrix</cite> containing the instances (rows), and a ndarray containing the labels</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
@ -406,7 +943,11 @@ File format <-1 or 0 or 1>[s col(int):val(float)]</p>
|
|||
File fomart <0 or 1> <document></p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>path</strong> – path to the labelled collection</p>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>path</strong> – path to the labelled collection</p></li>
|
||||
<li><p><strong>encoding</strong> – the text encoding used to open the file</p></li>
|
||||
<li><p><strong>verbose</strong> – if >0 (default) shows some progress information in standard output</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a list of sentences, and a list of labels</p>
|
||||
|
@ -418,9 +959,19 @@ File fomart <0 or 1> <document></p>
|
|||
<dt class="sig sig-object py" id="quapy.data.reader.reindex_labels">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">reindex_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.data.reader.reindex_labels" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
|
||||
E.g., y=[‘B’, ‘B’, ‘A’, ‘C’] -> [1,1,0,2], [‘A’,’B’,’C’]
|
||||
:param y: the list or array of original labels
|
||||
:return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.</p>
|
||||
E.g.:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">reindex_labels</span><span class="p">([</span><span class="s1">'B'</span><span class="p">,</span> <span class="s1">'B'</span><span class="p">,</span> <span class="s1">'A'</span><span class="p">,</span> <span class="s1">'C'</span><span class="p">])</span>
|
||||
<span class="gp">>>> </span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">]),</span> <span class="n">array</span><span class="p">([</span><span class="s1">'A'</span><span class="p">,</span> <span class="s1">'B'</span><span class="p">,</span> <span class="s1">'C'</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="s1">'<U1'</span><span class="p">))</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>y</strong> – the list or array of original labels</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</section>
|
||||
|
|
|
@ -515,14 +515,20 @@ will be taken from the environment variable <cite>SAMPLE_SIZE</cite> (which has
|
|||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.evaluation.artificial_prevalence_prediction">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.evaluation.</span></span><span class="sig-name descname"><span class="pre">artificial_prevalence_prediction</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="quapy.method.html#quapy.method.base.BaseQuantifier" title="quapy.method.base.BaseQuantifier"><span class="pre">quapy.method.base.BaseQuantifier</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><a class="reference internal" href="quapy.data.html#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">quapy.data.base.LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">sample_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevpoints</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">210</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_repetitions</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eval_budget</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">Optional</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">42</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.evaluation.artificial_prevalence_prediction" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Performs the predictions for all samples generated according to the artificial sampling protocol.
|
||||
:param model: the model in charge of generating the class prevalence estimations
|
||||
:param test: the test set on which to perform arificial sampling
|
||||
:param sample_size: the size of the samples
|
||||
:param n_prevpoints: the number of different prevalences to sample (or set to None if eval_budget is specified)
|
||||
:param n_repetitions: the number of repetitions for each prevalence
|
||||
:param eval_budget: if specified, sets a ceil on the number of evaluations to perform. For example, if there are 3
|
||||
classes, n_repetitions=1 and eval_budget=20, then n_prevpoints will be set to 5, since this will generate 15
|
||||
<dd><p>Performs the predictions for all samples generated according to the artificial sampling protocol.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>model</strong> – the model in charge of generating the class prevalence estimations</p></li>
|
||||
<li><p><strong>test</strong> – the test set on which to perform arificial sampling</p></li>
|
||||
<li><p><strong>sample_size</strong> – the size of the samples</p></li>
|
||||
<li><p><strong>n_prevpoints</strong> – the number of different prevalences to sample (or set to None if eval_budget is specified)</p></li>
|
||||
<li><p><strong>n_repetitions</strong> – the number of repetitions for each prevalence</p></li>
|
||||
<li><p><strong>eval_budget</strong> – if specified, sets a ceil on the number of evaluations to perform. For example, if there are 3</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
<p>classes, n_repetitions=1 and eval_budget=20, then n_prevpoints will be set to 5, since this will generate 15
|
||||
different prevalences ([0, 0, 1], [0, 0.25, 0.75], [0, 0.5, 0.5] … [1, 0, 0]) and since setting it n_prevpoints
|
||||
to 6 would produce more than 20 evaluations.
|
||||
:param n_jobs: number of jobs to be run in parallel
|
||||
|
@ -601,7 +607,31 @@ contains the the prevalence estimations</p>
|
|||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.functional.artificial_prevalence_sampling">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">artificial_prevalence_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dimensions</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeat</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">return_constrained_dim</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.artificial_prevalence_sampling" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Generates vectors of prevalence values artificially drawn from an exhaustive grid of prevalence values. The
|
||||
number of prevalence values explored for each dimension depends on <cite>n_prevalences</cite>, so that, if, for example,
|
||||
<cite>n_prevalences=11</cite> then the prevalence values of the grid are taken from [0, 0.1, 0.2, …, 0.9, 1]. Only
|
||||
valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
|
||||
valid vector of prevalence values, <cite>repeat</cite> copies are returned. The vector of prevalence values can be
|
||||
implicit (by setting <cite>return_constrained_dim=False</cite>), meaning that the last dimension (which is constrained
|
||||
to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to 1).</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>dimensions</strong> – the number of classes</p></li>
|
||||
<li><p><strong>n_prevalences</strong> – the number of equidistant prevalence points to extract from the [0,1] interval for the grid
|
||||
(default is 21)</p></li>
|
||||
<li><p><strong>repeat</strong> – number of copies for each valid prevalence vector (default is 1)</p></li>
|
||||
<li><p><strong>return_constrained_dim</strong> – set to True to return all dimensions, or to False (default) for ommitting the
|
||||
constrained dimension</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an ndarray of shape <cite>(n, dimensions)</cite> if <cite>return_constrained_dim=True</cite> or of shape <cite>(n, dimensions-1)</cite>
|
||||
if <cite>return_constrained_dim=False</cite>, where <cite>n</cite> is the number of valid combinations found in the grid multiplied
|
||||
by <cite>repeat</cite></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.functional.get_nprevpoints_approximation">
|
||||
|
@ -634,8 +664,21 @@ number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0
|
|||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.functional.prevalence_from_labels">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_from_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes_</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_from_labels" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_from_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_from_labels" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Computed the prevalence values from a vector of labels.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>labels</strong> – array-like of shape <cite>(n_instances)</cite> with the label for each instance</p></li>
|
||||
<li><p><strong>classes</strong> – the class labels. This is needed in order to correctly compute the prevalence vector even when
|
||||
some classes have no examples.</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an ndarray of shape <cite>(len(classes))</cite> with the class prevalence values</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.functional.prevalence_from_probabilities">
|
||||
|
@ -645,13 +688,21 @@ number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0
|
|||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.functional.prevalence_linspace">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.functional.</span></span><span class="sig-name descname"><span class="pre">prevalence_linspace</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">n_prevalences</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">21</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">repeat</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">smooth_limits_epsilon</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.01</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.functional.prevalence_linspace" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Produces a uniformly separated values of prevalence. By default, produces an array 21 prevalences, with step 0.05
|
||||
and with the limits smoothed, i.e.:
|
||||
[0.01, 0.05, 0.10, 0.15, …, 0.90, 0.95, 0.99]
|
||||
:param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
|
||||
:param repeat: number of times each prevalence is to be repeated (defaults to 1)
|
||||
:param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
|
||||
:return: an array of uniformly separated prevalence values</p>
|
||||
<dd><p>Produces a uniformly separated values of prevalence. By default, produces an array of 21 prevalence values, with
|
||||
step 0.05 and with the limits smoothed, i.e.:
|
||||
[0.01, 0.05, 0.10, 0.15, …, 0.90, 0.95, 0.99]</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>n_prevalences</strong> – the number of prevalence values to sample from the [0,1] interval (default 21)</p></li>
|
||||
<li><p><strong>repeat</strong> – number of times each prevalence is to be repeated (defaults to 1)</p></li>
|
||||
<li><p><strong>smooth_limits_epsilon</strong> – the quantity to add and subtract to the limits 0 and 1</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>an array of uniformly separated prevalence values</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,6 +1,3 @@
|
|||
from abc import abstractmethod
|
||||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
from scipy.sparse import vstack
|
||||
|
@ -9,18 +6,19 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
|
|||
from quapy.functional import artificial_prevalence_sampling, strprev
|
||||
|
||||
|
||||
|
||||
class LabelledCollection:
|
||||
'''
|
||||
A LabelledCollection is a set of objects each with a label associated to it.
|
||||
'''
|
||||
"""
|
||||
A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling
|
||||
routines.
|
||||
|
||||
:param instances: array-like (np.ndarray, list, or csr_matrix are supported)
|
||||
:param labels: array-like with the same length of instances
|
||||
:param classes_: optional, list of classes from which labels are taken. If not specified, the classes are inferred
|
||||
from the labels. The classes must be indicated in cases in which some of the labels might have no examples
|
||||
(i.e., a prevalence of 0)
|
||||
"""
|
||||
|
||||
def __init__(self, instances, labels, classes_=None):
|
||||
"""
|
||||
:param instances: list of objects
|
||||
:param labels: list of labels, same length of instances
|
||||
:param classes_: optional, list of classes from which labels are taken. When used, must contain the set of values used in labels.
|
||||
"""
|
||||
if issparse(instances):
|
||||
self.instances = instances
|
||||
elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):
|
||||
|
@ -42,28 +40,81 @@ class LabelledCollection:
|
|||
|
||||
@classmethod
|
||||
def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):
|
||||
"""
|
||||
Loads a labelled set of data and convert it into a :class:`LabelledCollection` instance. The function in charge
|
||||
of reading the instances must be specified. This function can be a custom one, or any of the reading functions
|
||||
defined in :mod:`quapy.data.reader` module.
|
||||
|
||||
:param path: string, the path to the file containing the labelled instances
|
||||
:param loader_func: a custom function that implements the data loader and returns a tuple with instances and
|
||||
labels
|
||||
:param classes: array-like, the classes according to which the instances are labelled
|
||||
:param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances, i.e.,
|
||||
these arguments are used to call `loader_func(path, **loader_kwargs)`
|
||||
:return: a :class:`LabelledCollection` object
|
||||
"""
|
||||
return LabelledCollection(*loader_func(path, **loader_kwargs), classes)
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
Returns the length of this collection (number of labelled instances)
|
||||
|
||||
:return: integer
|
||||
"""
|
||||
return self.instances.shape[0]
|
||||
|
||||
def prevalence(self):
|
||||
"""
|
||||
Returns the prevalence, or relative frequency, of the classes of interest.
|
||||
|
||||
:return: a np.ndarray of shape `(n_classes)` with the relative frequencies of each class, in the same order
|
||||
as listed by `self.classes_`
|
||||
"""
|
||||
return self.counts() / len(self)
|
||||
|
||||
def counts(self):
|
||||
"""
|
||||
Returns the number of instances for each of the classes of interest.
|
||||
|
||||
:return: a np.ndarray of shape `(n_classes)` with the number of instances of each class, in the same order
|
||||
as listed by `self.classes_`
|
||||
"""
|
||||
return np.asarray([len(self.index[class_]) for class_ in self.classes_])
|
||||
|
||||
@property
|
||||
def n_classes(self):
|
||||
"""
|
||||
The number of classes
|
||||
|
||||
:return: integer
|
||||
"""
|
||||
return len(self.classes_)
|
||||
|
||||
@property
|
||||
def binary(self):
|
||||
"""
|
||||
Returns True if the number of classes is 2
|
||||
|
||||
:return: boolean
|
||||
"""
|
||||
return self.n_classes == 2
|
||||
|
||||
def sampling_index(self, size, *prevs, shuffle=True):
|
||||
"""
|
||||
Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
|
||||
prevalence values are not specified, then returns the index of a uniform sampling.
|
||||
For each class, the sampling is drawn without replacement if the requested prevalence is larger than
|
||||
the actual prevalence of the class, or with replacement otherwise.
|
||||
|
||||
:param size: integer, the requested size
|
||||
:param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since
|
||||
it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
|
||||
`self.classes_` can be specified, while the other class takes prevalence value `1-p`
|
||||
:param shuffle: if set to True (default), shuffles the index before returning it
|
||||
:return: a np.ndarray of shape `(size)` with the indexes
|
||||
"""
|
||||
if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling
|
||||
return np.random.choice(len(self), size, replace=False)
|
||||
return self.uniform_sampling_index(size)
|
||||
if len(prevs) == self.n_classes - 1:
|
||||
prevs = prevs + (1 - sum(prevs),)
|
||||
assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
|
||||
|
@ -93,47 +144,142 @@ class LabelledCollection:
|
|||
return indexes_sample
|
||||
|
||||
def uniform_sampling_index(self, size):
|
||||
"""
|
||||
Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
|
||||
without replacement if the requested size is greater than the number of instances, or with replacement
|
||||
otherwise.
|
||||
|
||||
:param size: integer, the size of the uniform sample
|
||||
:return: a np.ndarray of shape `(size)` with the indexes
|
||||
"""
|
||||
return np.random.choice(len(self), size, replace=False)
|
||||
|
||||
def uniform_sampling(self, size):
|
||||
unif_index = self.uniform_sampling_index(size)
|
||||
return self.sampling_from_index(unif_index)
|
||||
|
||||
def sampling(self, size, *prevs, shuffle=True):
|
||||
"""
|
||||
Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence
|
||||
values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
|
||||
the actual prevalence of the class, or with replacement otherwise.
|
||||
|
||||
:param size: integer, the requested size
|
||||
:param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since
|
||||
it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
|
||||
`self.classes_` can be specified, while the other class takes prevalence value `1-p`
|
||||
:param shuffle: if set to True (default), shuffles the index before returning it
|
||||
:return: an instance of :class:`LabelledCollection` with length == `size` and prevalence close to `prevs` (or
|
||||
prevalence == `prevs` if the exact prevalence values can be met as proportions of instances)
|
||||
"""
|
||||
prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
|
||||
return self.sampling_from_index(prev_index)
|
||||
|
||||
def uniform_sampling(self, size):
|
||||
"""
|
||||
Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
|
||||
without replacement if the requested size is greater than the number of instances, or with replacement
|
||||
otherwise.
|
||||
|
||||
:param size: integer, the requested size
|
||||
:return: an instance of :class:`LabelledCollection` with length == `size`
|
||||
"""
|
||||
unif_index = self.uniform_sampling_index(size)
|
||||
return self.sampling_from_index(unif_index)
|
||||
|
||||
def sampling_from_index(self, index):
|
||||
"""
|
||||
Returns an instance of :class:`LabelledCollection` whose elements are sampled from this collection using the
|
||||
index.
|
||||
|
||||
:param index: np.ndarray
|
||||
:return: an instance of :class:`LabelledCollection`
|
||||
"""
|
||||
documents = self.instances[index]
|
||||
labels = self.labels[index]
|
||||
return LabelledCollection(documents, labels, classes_=self.classes_)
|
||||
|
||||
def split_stratified(self, train_prop=0.6, random_state=None):
|
||||
# with temp_seed(42):
|
||||
"""
|
||||
Returns two instances of :class:`LabelledCollection` split with stratification from this collection, at desired
|
||||
proportion.
|
||||
|
||||
:param train_prop: the proportion of elements to include in the left-most returned collection (typically used
|
||||
as the training collection). The rest of elements are included in the right-most returned collection
|
||||
(typically used as a test collection).
|
||||
:param random_state: if specified, guarantees reproducibility of the split.
|
||||
:return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the
|
||||
second one with `1-train_prop` elements
|
||||
"""
|
||||
tr_docs, te_docs, tr_labels, te_labels = \
|
||||
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
|
||||
random_state=random_state)
|
||||
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
|
||||
|
||||
def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
|
||||
"""
|
||||
A generator of samples that implements the artificial prevalence protocol (APP). The APP consists of exploring
|
||||
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, ..., 1]), and generating all valid combinations of
|
||||
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
|
||||
[1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
|
||||
combination of prevalence values is indicated by `repeats`
|
||||
|
||||
:param sample_size: the number of instances in each sample
|
||||
:param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
|
||||
limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
|
||||
:param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
|
||||
:return: yield samples generated at artificially controlled prevalence values
|
||||
"""
|
||||
dimensions = self.n_classes
|
||||
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
|
||||
yield self.sampling(sample_size, *prevs)
|
||||
|
||||
def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
|
||||
"""
|
||||
A generator of sample indexes implementing the artificial prevalence protocol (APP).
|
||||
The APP consists of exploring
|
||||
a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, ..., 1]), and generating all valid combinations of
|
||||
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
|
||||
[1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of sample indexes for each valid
|
||||
combination of prevalence values is indicated by `repeats`
|
||||
|
||||
:param sample_size: the number of instances in each sample (i.e., length of each index)
|
||||
:param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
|
||||
limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
|
||||
:param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
|
||||
:return: yield the indexes that generate the samples according to APP
|
||||
"""
|
||||
dimensions = self.n_classes
|
||||
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
|
||||
yield self.sampling_index(sample_size, *prevs)
|
||||
|
||||
def natural_sampling_generator(self, sample_size, repeats=100):
|
||||
"""
|
||||
A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
|
||||
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
|
||||
|
||||
:param sample_size: integer, the number of instances in each sample
|
||||
:param repeats: the number of samples to generate
|
||||
:return: yield instances of :class:`LabelledCollection`
|
||||
"""
|
||||
for _ in range(repeats):
|
||||
yield self.uniform_sampling(sample_size)
|
||||
|
||||
def natural_sampling_index_generator(self, sample_size, repeats=100):
|
||||
"""
|
||||
A generator of sample indexes according to the natural prevalence protocol (NPP). The NPP consists of drawing
|
||||
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
|
||||
|
||||
:param sample_size: integer, the number of instances in each sample (i.e., the length of each index)
|
||||
:param repeats: the number of indexes to generate
|
||||
:return: yield `repeats` instances of np.ndarray with shape `(sample_size,)`
|
||||
"""
|
||||
for _ in range(repeats):
|
||||
yield self.uniform_sampling_index(sample_size)
|
||||
|
||||
def __add__(self, other):
|
||||
"""
|
||||
Returns a new :class:`LabelledCollection` as the union of this collection with another collection
|
||||
|
||||
:param other: another :class:`LabelledCollection`
|
||||
:return: a :class:`LabelledCollection` representing the union of both collections
|
||||
"""
|
||||
if other is None:
|
||||
return self
|
||||
elif issparse(self.instances) and issparse(other.instances):
|
||||
|
@ -149,9 +295,29 @@ class LabelledCollection:
|
|||
|
||||
@property
|
||||
def Xy(self):
|
||||
"""
|
||||
Gets the instances and labels. This is useful when working with `sklearn` estimators, e.g.:
|
||||
|
||||
>>> svm = LinearSVC().fit(*my_collection.Xy)
|
||||
|
||||
:return: a tuple `(instances, labels)` from this collection
|
||||
"""
|
||||
return self.instances, self.labels
|
||||
|
||||
def stats(self, show=True):
|
||||
"""
|
||||
Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,:
|
||||
|
||||
>>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
|
||||
>>> data.training.stats()
|
||||
>>> #instances=3821, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]
|
||||
|
||||
:param show: if set to True (default), prints the stats in standard output
|
||||
:return: a dictionary containing some stats of this collection. Keys include `#instances` (the number of
|
||||
instances), `type` (the type representing the instances), `#features` (the number of features, if the
|
||||
instances are in array-like format), `#classes` (the classes of the collection), `prevs` (the prevalence
|
||||
values for each class)
|
||||
"""
|
||||
ninstances = len(self)
|
||||
instance_type = type(self.instances[0])
|
||||
if instance_type == list:
|
||||
|
@ -171,6 +337,14 @@ class LabelledCollection:
|
|||
return stats_
|
||||
|
||||
def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
|
||||
"""
|
||||
Generator of stratified folds to be used in k-fold cross validation.
|
||||
|
||||
:param nfolds: integer (default 5), the number of folds to generate
|
||||
:param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run
|
||||
:param random_state: integer (default 0), guarantees that the folds generated are reproducible
|
||||
:return: yields `nfolds * nrepeats` folds for k-fold cross validation
|
||||
"""
|
||||
kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
|
||||
for train_index, test_index in kf.split(*self.Xy):
|
||||
train = self.sampling_from_index(train_index)
|
||||
|
@ -178,8 +352,15 @@ class LabelledCollection:
|
|||
yield train, test
|
||||
|
||||
|
||||
|
||||
class Dataset:
|
||||
"""
|
||||
Abstraction of training and test :class:`LabelledCollection` objects.
|
||||
|
||||
:param training: a :class:`LabelledCollection` instance
|
||||
:param test: a :class:`LabelledCollection` instance
|
||||
:param vocabulary: if indicated, is a dictionary of the terms used in this textual dataset
|
||||
:param name: a string representing the name of the dataset
|
||||
"""
|
||||
|
||||
def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
|
||||
assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections'
|
||||
|
@ -190,33 +371,91 @@ class Dataset:
|
|||
|
||||
@classmethod
|
||||
def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
|
||||
"""
|
||||
Generates a :class:`Dataset` from a stratified split of a :class:`LabelledCollection` instance.
|
||||
See :meth:`LabelledCollection.split_stratified`
|
||||
|
||||
:param collection: :class:`LabelledCollection`
|
||||
:param train_size: the proportion of training documents (the rest conforms the test split)
|
||||
:return: an instance of :class:`Dataset`
|
||||
"""
|
||||
return Dataset(*collection.split_stratified(train_prop=train_size))
|
||||
|
||||
@property
|
||||
def classes_(self):
|
||||
"""
|
||||
The classes according to which the training collection is labelled
|
||||
|
||||
:return: The classes according to which the training collection is labelled
|
||||
"""
|
||||
return self.training.classes_
|
||||
|
||||
@property
|
||||
def n_classes(self):
|
||||
"""
|
||||
The number of classes according to which the training collection is labelled
|
||||
|
||||
:return: integer
|
||||
"""
|
||||
return self.training.n_classes
|
||||
|
||||
@property
|
||||
def binary(self):
|
||||
"""
|
||||
Returns True if the training collection is labelled according to two classes
|
||||
|
||||
:return: boolean
|
||||
"""
|
||||
return self.training.binary
|
||||
|
||||
@classmethod
|
||||
def load(cls, train_path, test_path, loader_func: callable):
|
||||
training = LabelledCollection.load(train_path, loader_func)
|
||||
test = LabelledCollection.load(test_path, loader_func)
|
||||
def load(cls, train_path, test_path, loader_func: callable, classes=None, **loader_kwargs):
|
||||
"""
|
||||
Loads a training and a test labelled set of data and convert it into a :class:`Dataset` instance.
|
||||
The function in charge of reading the instances must be specified. This function can be a custom one, or any of
|
||||
the reading functions defined in :mod:`quapy.data.reader` module.
|
||||
|
||||
:param train_path: string, the path to the file containing the training instances
|
||||
:param test_path: string, the path to the file containing the test instances
|
||||
:param loader_func: a custom function that implements the data loader and returns a tuple with instances and
|
||||
labels
|
||||
:param classes: array-like, the classes according to which the instances are labelled
|
||||
:param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances.
|
||||
See :meth:`LabelledCollection.load` for further details.
|
||||
:return: a :class:`Dataset` object
|
||||
"""
|
||||
|
||||
training = LabelledCollection.load(train_path, loader_func, classes, **loader_kwargs)
|
||||
test = LabelledCollection.load(test_path, loader_func, classes, **loader_kwargs)
|
||||
return Dataset(training, test)
|
||||
|
||||
@property
|
||||
def vocabulary_size(self):
|
||||
"""
|
||||
If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary
|
||||
|
||||
:return: integer
|
||||
"""
|
||||
return len(self.vocabulary)
|
||||
|
||||
def stats(self):
|
||||
def stats(self, show):
|
||||
"""
|
||||
Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:
|
||||
|
||||
>>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
|
||||
>>> data.stats()
|
||||
>>> Dataset=kindle #tr-instances=3821, #te-instances=21591, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]
|
||||
|
||||
:param show: if set to True (default), prints the stats in standard output
|
||||
:return: a dictionary containing some stats of this collection for the training and test collections. The keys
|
||||
are `train` and `test`, and point to dedicated dictionaries of stats, for each collection, with keys
|
||||
`#instances` (the number of instances), `type` (the type representing the instances),
|
||||
`#features` (the number of features, if the instances are in array-like format), `#classes` (the classes of
|
||||
the collection), `prevs` (the prevalence values for each class)
|
||||
"""
|
||||
tr_stats = self.training.stats(show=False)
|
||||
te_stats = self.test.stats(show=False)
|
||||
if show:
|
||||
print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
|
||||
f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
|
||||
f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
|
||||
|
@ -224,11 +463,26 @@ class Dataset:
|
|||
|
||||
@classmethod
|
||||
def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):
|
||||
"""
|
||||
Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around
|
||||
:meth:`LabelledCollection.kFCV` that returns :class:`Dataset` instances made of training and test folds.
|
||||
|
||||
:param nfolds: integer (default 5), the number of folds to generate
|
||||
:param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run
|
||||
:param random_state: integer (default 0), guarantees that the folds generated are reproducible
|
||||
:return: yields `nfolds * nrepeats` folds for k-fold cross validation as instances of :class:`Dataset`
|
||||
"""
|
||||
for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)):
|
||||
yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')
|
||||
|
||||
|
||||
def isbinary(data):
|
||||
"""
|
||||
Returns True if `data` is either a binary :class:`Dataset` or a binary :class:`LabelledCollection`
|
||||
|
||||
:param data: a :class:`Dataset` or a :class:`LabelledCollection` object
|
||||
:return: True if labelled according to two classes
|
||||
"""
|
||||
if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
|
||||
return data.binary
|
||||
return False
|
||||
|
|
|
@ -5,9 +5,6 @@ warnings.warn = warn
|
|||
import os
|
||||
import zipfile
|
||||
from os.path import join
|
||||
from urllib.error import HTTPError
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from quapy.data.base import Dataset, LabelledCollection
|
||||
|
@ -49,9 +46,11 @@ UCI_DATASETS = ['acute.a', 'acute.b',
|
|||
|
||||
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
|
||||
"""
|
||||
Load a Reviews dataset as a Dataset instance, as used in:
|
||||
Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
|
||||
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
|
||||
Loads a Reviews dataset as a Dataset instance, as used in
|
||||
`Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
|
||||
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_.
|
||||
The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`
|
||||
|
||||
:param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
|
||||
:param tfidf: set to True to transform the raw documents into tfidf weighted matrices
|
||||
:param min_df: minimun number of documents that should contain a term in order for the term to be
|
||||
|
@ -60,7 +59,7 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
|
|||
~/quay_data/ directory)
|
||||
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||
faster subsequent invokations
|
||||
:return: a Dataset instance
|
||||
:return: a :class:`quapy.data.base.Dataset` instance
|
||||
"""
|
||||
assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
|
||||
f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
|
||||
|
@ -93,10 +92,13 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
|
|||
|
||||
def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
|
||||
"""
|
||||
Load a Twitter dataset as a Dataset instance, as used in:
|
||||
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
Social Network Analysis and Mining6(19), 1–22 (2016)
|
||||
The datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
|
||||
Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:
|
||||
`Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
Social Network Analysis and Mining6(19), 1–22 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_
|
||||
Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
|
||||
The list of valid dataset names corresponding to training sets can be accessed in
|
||||
`quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in
|
||||
`quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`
|
||||
|
||||
:param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
|
||||
'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
|
||||
|
@ -108,7 +110,7 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
|
|||
~/quay_data/ directory)
|
||||
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||
faster subsequent invokations
|
||||
:return: a Dataset instance
|
||||
:return: a :class:`quapy.data.base.Dataset` instance
|
||||
"""
|
||||
assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
|
||||
f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
|
||||
|
@ -163,11 +165,58 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
|
|||
|
||||
|
||||
def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
|
||||
"""
|
||||
Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
|
||||
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
|
||||
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
|
||||
Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
|
||||
and
|
||||
`Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
|
||||
Dynamic ensemble selection for quantification tasks.
|
||||
Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
|
||||
The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
|
||||
information on how to use these collections), and so a train-test split is generated at desired proportion.
|
||||
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
|
||||
|
||||
:param dataset_name: a dataset name
|
||||
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)
|
||||
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
|
||||
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
|
||||
:return: a :class:`quapy.data.base.Dataset` instance
|
||||
"""
|
||||
data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
|
||||
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
|
||||
|
||||
|
||||
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
|
||||
"""
|
||||
Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
|
||||
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
|
||||
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
|
||||
Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
|
||||
and
|
||||
`Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
|
||||
Dynamic ensemble selection for quantification tasks.
|
||||
Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
|
||||
The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
|
||||
protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
|
||||
This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
|
||||
|
||||
>>> import quapy as qp
|
||||
>>> collection = qp.datasets.fetch_UCILabelledCollection("yeast")
|
||||
>>> for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
|
||||
>>> ...
|
||||
|
||||
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
|
||||
|
||||
:param dataset_name: a dataset name
|
||||
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)
|
||||
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
|
||||
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
|
||||
:return: a :class:`quapy.data.base.Dataset` instance
|
||||
"""
|
||||
|
||||
assert dataset_name in UCI_DATASETS, \
|
||||
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
|
||||
|
@ -302,7 +351,7 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
|
|||
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
|
||||
|
||||
df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
|
||||
[df_replace(df, col) for col in range(1, 6)]
|
||||
[_df_replace(df, col) for col in range(1, 6)]
|
||||
X = df.loc[:, 0:5].values
|
||||
if dataset_name == 'acute.a':
|
||||
y = binarize(df[6], pos_class='yes')
|
||||
|
@ -482,5 +531,5 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
|
|||
return data
|
||||
|
||||
|
||||
def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
|
||||
def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
|
||||
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
|
|
@ -12,14 +12,18 @@ from .base import LabelledCollection
|
|||
|
||||
def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
|
||||
"""
|
||||
Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors
|
||||
:param dataset: a Dataset where the instances are lists of str
|
||||
:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary
|
||||
:param sublinear_tf: whether or not to apply the log scalling to the tf counters
|
||||
:param inplace: whether or not to apply the transformation inplace, or to a new copy
|
||||
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer)
|
||||
:return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||
where the instances are stored in a csr_matrix of real-valued tfidf scores
|
||||
Transforms a :class:`quapy.data.base.Dataset` of textual instances into a :class:`quapy.data.base.Dataset` of
|
||||
tfidf weighted sparse vectors
|
||||
|
||||
:param dataset: a :class:`quapy.data.base.Dataset` where the instances of training and test collections are
|
||||
lists of str
|
||||
:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)
|
||||
:param sublinear_tf: whether or not to apply the log scalling to the tf counters (default True)
|
||||
:param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
|
||||
:param kwargs: the rest of parameters of the transformation (as for sklearn's
|
||||
`TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_)
|
||||
:return: a new :class:`quapy.data.base.Dataset` in `csr_matrix` format (if inplace=False) or a reference to the
|
||||
current Dataset (if inplace=True) where the instances are stored in a `csr_matrix` of real-valued tfidf scores
|
||||
"""
|
||||
__check_type(dataset.training.instances, np.ndarray, str)
|
||||
__check_type(dataset.test.instances, np.ndarray, str)
|
||||
|
@ -41,13 +45,17 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
|
|||
|
||||
def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
|
||||
"""
|
||||
Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least
|
||||
_min_df_ instances
|
||||
:param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix)
|
||||
:param min_df: minimum number of instances below which the columns are removed
|
||||
:param inplace: whether or not to apply the transformation inplace, or to a new copy
|
||||
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||
where the dimensions corresponding to infrequent instances have been removed
|
||||
Reduces the dimensionality of the instances, represented as a `csr_matrix` (or any subtype of
|
||||
`scipy.sparse.spmatrix`), of training and test documents by removing the columns of words which are not present
|
||||
in at least `min_df` instances in the training set
|
||||
|
||||
:param dataset: a :class:`quapy.data.base.Dataset` in which instances are represented in sparse format (any
|
||||
subtype of scipy.sparse.spmatrix)
|
||||
:param min_df: integer, minimum number of instances below which the columns are removed
|
||||
:param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
|
||||
:return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current
|
||||
:class:`quapy.data.base.Dataset` (inplace=True) where the dimensions corresponding to infrequent terms
|
||||
in the training set have been removed
|
||||
"""
|
||||
__check_type(dataset.training.instances, spmatrix)
|
||||
__check_type(dataset.test.instances, spmatrix)
|
||||
|
@ -71,7 +79,17 @@ def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
|
|||
return Dataset(training, test)
|
||||
|
||||
|
||||
def standardize(dataset: Dataset, inplace=True):
|
||||
def standardize(dataset: Dataset, inplace=False):
|
||||
"""
|
||||
Standardizes the real-valued columns of a :class:`quapy.data.base.Dataset`.
|
||||
Standardization, aka z-scoring, of a variable `X` comes down to subtracting the average and normalizing by the
|
||||
standard deviation.
|
||||
|
||||
:param dataset: a :class:`quapy.data.base.Dataset` object
|
||||
:param inplace: set to True if the transformation is to be applied inplace, or to False (default) if a new
|
||||
:class:`quapy.data.base.Dataset` is to be returned
|
||||
:return:
|
||||
"""
|
||||
s = StandardScaler(copy=not inplace)
|
||||
training = s.fit_transform(dataset.training.instances)
|
||||
test = s.transform(dataset.test.instances)
|
||||
|
@ -83,14 +101,18 @@ def standardize(dataset: Dataset, inplace=True):
|
|||
|
||||
def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
|
||||
"""
|
||||
Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index.
|
||||
Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK
|
||||
:param dataset: a Dataset where the instances are lists of str
|
||||
:param min_df: minimum number of instances below which the term is replaced by a UNK index
|
||||
:param inplace: whether or not to apply the transformation inplace, or to a new copy
|
||||
:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer)
|
||||
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||
consisting of lists of integer values representing indices.
|
||||
Indexes the tokens of a textual :class:`quapy.data.base.Dataset` of string documents.
|
||||
To index a document means to replace each different token by a unique numerical index.
|
||||
Rare words (i.e., words occurring less than `min_df` times) are replaced by a special token `UNK`
|
||||
|
||||
:param dataset: a :class:`quapy.data.base.Dataset` object where the instances of training and test documents
|
||||
are lists of str
|
||||
:param min_df: minimum number of occurrences below which the term is replaced by a `UNK` index
|
||||
:param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default)
|
||||
:param kwargs: the rest of parameters of the transformation (as for sklearn's
|
||||
`CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>_`)
|
||||
:return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current
|
||||
:class:`quapy.data.base.Dataset` (inplace=True) consisting of lists of integer values representing indices.
|
||||
"""
|
||||
__check_type(dataset.training.instances, np.ndarray, str)
|
||||
__check_type(dataset.test.instances, np.ndarray, str)
|
||||
|
@ -120,17 +142,23 @@ def __check_type(container, container_type=None, element_type=None):
|
|||
|
||||
|
||||
class IndexTransformer:
|
||||
"""
|
||||
This class implements a sklearn's-style transformer that indexes text as numerical ids for the tokens it
|
||||
contains, and that would be generated by sklearn's
|
||||
`CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
|
||||
|
||||
:param kwargs: keyworded arguments from `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
"""
|
||||
:param kwargs: keyworded arguments from _sklearn.feature_extraction.text.CountVectorizer_
|
||||
"""
|
||||
self.vect = CountVectorizer(**kwargs)
|
||||
self.unk = -1 # a valid index is assigned after fit
|
||||
self.pad = -2 # a valid index is assigned after fit
|
||||
|
||||
def fit(self, X):
|
||||
"""
|
||||
Fits the transformer, i.e., decides on the vocabulary, given a list of strings.
|
||||
|
||||
:param X: a list of strings
|
||||
:return: self
|
||||
"""
|
||||
|
@ -142,22 +170,52 @@ class IndexTransformer:
|
|||
return self
|
||||
|
||||
def transform(self, X, n_jobs=-1):
|
||||
"""
|
||||
Transforms the strings in `X` as lists of numerical ids
|
||||
|
||||
:param X: a list of strings
|
||||
:param n_jobs: the number of parallel workers to carry out this task
|
||||
:return: a `np.ndarray` of numerical ids
|
||||
"""
|
||||
# given the number of tasks and the number of jobs, generates the slices for the parallel processes
|
||||
assert self.unk != -1, 'transform called before fit'
|
||||
indexed = map_parallel(func=self.index, args=X, n_jobs=n_jobs)
|
||||
indexed = map_parallel(func=self._index, args=X, n_jobs=n_jobs)
|
||||
return np.asarray(indexed)
|
||||
|
||||
def index(self, documents):
|
||||
def _index(self, documents):
|
||||
vocab = self.vocabulary_.copy()
|
||||
return [[vocab.prevalence(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
|
||||
|
||||
def fit_transform(self, X, n_jobs=-1):
|
||||
"""
|
||||
Fits the transform on `X` and transforms it.
|
||||
|
||||
:param X: a list of strings
|
||||
:param n_jobs: the number of parallel workers to carry out this task
|
||||
:return: a `np.ndarray` of numerical ids
|
||||
"""
|
||||
return self.fit(X).transform(X, n_jobs=n_jobs)
|
||||
|
||||
def vocabulary_size(self):
|
||||
"""
|
||||
Gets the length of the vocabulary according to which the document tokens have been indexed
|
||||
|
||||
:return: integer
|
||||
"""
|
||||
return len(self.vocabulary_)
|
||||
|
||||
def add_word(self, word, id=None, nogaps=True):
|
||||
"""
|
||||
Adds a new token (regardless of whether it has been found in the text or not), with dedicated id.
|
||||
Useful to define special tokens for codifying unknown words, or padding tokens.
|
||||
|
||||
:param word: string, surface form of the token
|
||||
:param id: integer, numerical value to assign to the token (leave as None for indicating the next valid id,
|
||||
default)
|
||||
:param nogaps: if set to True (default) asserts that the id indicated leads to no numerical gaps with
|
||||
precedent ids stored so far
|
||||
:return: integer, the numerical id for the new token
|
||||
"""
|
||||
if word in self.vocabulary_:
|
||||
raise ValueError(f'word {word} already in dictionary')
|
||||
if id is None:
|
||||
|
|
|
@ -7,7 +7,10 @@ def from_text(path, encoding='utf-8', verbose=1, class2int=True):
|
|||
"""
|
||||
Reads a labelled colletion of documents.
|
||||
File fomart <0 or 1>\t<document>\n
|
||||
|
||||
:param path: path to the labelled collection
|
||||
:param encoding: the text encoding used to open the file
|
||||
:param verbose: if >0 (default) shows some progress information in standard output
|
||||
:return: a list of sentences, and a list of labels
|
||||
"""
|
||||
all_sentences, all_labels = [], []
|
||||
|
@ -35,8 +38,9 @@ def from_sparse(path):
|
|||
"""
|
||||
Reads a labelled collection of real-valued instances expressed in sparse format
|
||||
File format <-1 or 0 or 1>[\s col(int):val(float)]\n
|
||||
|
||||
:param path: path to the labelled collection
|
||||
:return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
|
||||
:return: a `csr_matrix` containing the instances (rows), and a ndarray containing the labels
|
||||
"""
|
||||
|
||||
def split_col_val(col_val):
|
||||
|
@ -68,8 +72,10 @@ def from_csv(path, encoding='utf-8'):
|
|||
"""
|
||||
Reads a csv file in which columns are separated by ','.
|
||||
File format <label>,<feat1>,<feat2>,...,<featn>\n
|
||||
|
||||
:param path: path to the csv file
|
||||
:return: a ndarray for the labels and a ndarray (float) for the covariates
|
||||
:param encoding: the text encoding used to open the file
|
||||
:return: a np.ndarray for the labels and a ndarray (float) for the covariates
|
||||
"""
|
||||
|
||||
X, y = [], []
|
||||
|
@ -85,11 +91,16 @@ def from_csv(path, encoding='utf-8'):
|
|||
def reindex_labels(y):
|
||||
"""
|
||||
Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
|
||||
E.g., y=['B', 'B', 'A', 'C'] -> [1,1,0,2], ['A','B','C']
|
||||
E.g.:
|
||||
|
||||
>>> reindex_labels(['B', 'B', 'A', 'C'])
|
||||
>>> (array([1, 1, 0, 2]), array(['A', 'B', 'C'], dtype='<U1'))
|
||||
|
||||
:param y: the list or array of original labels
|
||||
:return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.
|
||||
"""
|
||||
classnames = sorted(np.unique(y))
|
||||
y = np.asarray(y)
|
||||
classnames = np.asarray(sorted(np.unique(y)))
|
||||
label2index = {label: index for index, label in enumerate(classnames)}
|
||||
indexed = np.empty(y.shape, dtype=np.int)
|
||||
for label in classnames:
|
||||
|
@ -98,6 +109,17 @@ def reindex_labels(y):
|
|||
|
||||
|
||||
def binarize(y, pos_class):
|
||||
"""
|
||||
Binarizes a categorical array-like collection of labels towards the positive class `pos_class`. E.g.,:
|
||||
|
||||
>>> binarize([1, 2, 3, 1, 1, 0], pos_class=2)
|
||||
>>> array([0, 1, 0, 0, 0, 0])
|
||||
|
||||
:param y: array-like of labels
|
||||
:param pos_class: integer, the positive class
|
||||
:return: a binary np.ndarray, in which values 1 corresponds to positions in whcih `y` had `pos_class` labels, and
|
||||
0 otherwise
|
||||
"""
|
||||
y = np.asarray(y)
|
||||
ybin = np.zeros(y.shape, dtype=np.int)
|
||||
ybin[y == pos_class] = 1
|
||||
|
|
Loading…
Reference in New Issue