forked from moreo/QuaPy
831 lines
48 KiB
HTML
831 lines
48 KiB
HTML
|
||
|
||
<!doctype html>
|
||
|
||
<html lang="en">
|
||
<head>
|
||
<meta charset="utf-8" />
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
|
||
|
||
<title>Datasets — QuaPy 0.1.7 documentation</title>
|
||
<link rel="stylesheet" type="text/css" href="_static/pygments.css" />
|
||
<link rel="stylesheet" type="text/css" href="_static/bizstyle.css" />
|
||
|
||
<script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
|
||
<script src="_static/jquery.js"></script>
|
||
<script src="_static/underscore.js"></script>
|
||
<script src="_static/_sphinx_javascript_frameworks_compat.js"></script>
|
||
<script src="_static/doctools.js"></script>
|
||
<script src="_static/sphinx_highlight.js"></script>
|
||
<script src="_static/bizstyle.js"></script>
|
||
<link rel="index" title="Index" href="genindex.html" />
|
||
<link rel="search" title="Search" href="search.html" />
|
||
<link rel="next" title="Evaluation" href="Evaluation.html" />
|
||
<link rel="prev" title="Installation" href="Installation.html" />
|
||
<meta name="viewport" content="width=device-width,initial-scale=1.0" />
|
||
<!--[if lt IE 9]>
|
||
<script src="_static/css3-mediaqueries.js"></script>
|
||
<![endif]-->
|
||
</head><body>
|
||
<div class="related" role="navigation" aria-label="related navigation">
|
||
<h3>Navigation</h3>
|
||
<ul>
|
||
<li class="right" style="margin-right: 10px">
|
||
<a href="genindex.html" title="General Index"
|
||
accesskey="I">index</a></li>
|
||
<li class="right" >
|
||
<a href="py-modindex.html" title="Python Module Index"
|
||
>modules</a> |</li>
|
||
<li class="right" >
|
||
<a href="Evaluation.html" title="Evaluation"
|
||
accesskey="N">next</a> |</li>
|
||
<li class="right" >
|
||
<a href="Installation.html" title="Installation"
|
||
accesskey="P">previous</a> |</li>
|
||
<li class="nav-item nav-item-0"><a href="index.html">QuaPy 0.1.7 documentation</a> »</li>
|
||
<li class="nav-item nav-item-this"><a href="">Datasets</a></li>
|
||
</ul>
|
||
</div>
|
||
|
||
<div class="document">
|
||
<div class="documentwrapper">
|
||
<div class="bodywrapper">
|
||
<div class="body" role="main">
|
||
|
||
<section id="datasets">
|
||
<h1>Datasets<a class="headerlink" href="#datasets" title="Permalink to this heading">¶</a></h1>
|
||
<p>QuaPy makes available several datasets that have been used in
|
||
quantification literature, as well as an interface to allow
|
||
anyone to import their custom datasets.</p>
|
||
<p>A <em>Dataset</em> object in QuaPy is roughly a pair of <em>LabelledCollection</em> objects,
|
||
one playing the role of the training set, another the test set.
|
||
<em>LabelledCollection</em> is a data class consisting of the (iterable)
|
||
instances and labels. This class handles most of the sampling functionality in QuaPy.
|
||
Take a look at the following code:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
|
||
<span class="kn">import</span> <span class="nn">quapy.functional</span> <span class="k">as</span> <span class="nn">F</span>
|
||
|
||
<span class="n">instances</span> <span class="o">=</span> <span class="p">[</span>
|
||
<span class="s1">'1st positive document'</span><span class="p">,</span> <span class="s1">'2nd positive document'</span><span class="p">,</span>
|
||
<span class="s1">'the only negative document'</span><span class="p">,</span>
|
||
<span class="s1">'1st neutral document'</span><span class="p">,</span> <span class="s1">'2nd neutral document'</span><span class="p">,</span> <span class="s1">'3rd neutral document'</span>
|
||
<span class="p">]</span>
|
||
<span class="n">labels</span> <span class="o">=</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">]</span>
|
||
|
||
<span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">LabelledCollection</span><span class="p">(</span><span class="n">instances</span><span class="p">,</span> <span class="n">labels</span><span class="p">)</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">strprev</span><span class="p">(</span><span class="n">data</span><span class="o">.</span><span class="n">prevalence</span><span class="p">(),</span> <span class="n">prec</span><span class="o">=</span><span class="mi">2</span><span class="p">))</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>This outputs the class prevalences (showing 2-digit precision):</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[</span><span class="mf">0.17</span><span class="p">,</span> <span class="mf">0.50</span><span class="p">,</span> <span class="mf">0.33</span><span class="p">]</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>One can easily produce new samples at desired class prevalence values:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">sample_size</span> <span class="o">=</span> <span class="mi">10</span>
|
||
<span class="n">prev</span> <span class="o">=</span> <span class="p">[</span><span class="mf">0.4</span><span class="p">,</span> <span class="mf">0.1</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">]</span>
|
||
<span class="n">sample</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">sampling</span><span class="p">(</span><span class="n">sample_size</span><span class="p">,</span> <span class="o">*</span><span class="n">prev</span><span class="p">)</span>
|
||
|
||
<span class="nb">print</span><span class="p">(</span><span class="s1">'instances:'</span><span class="p">,</span> <span class="n">sample</span><span class="o">.</span><span class="n">instances</span><span class="p">)</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="s1">'labels:'</span><span class="p">,</span> <span class="n">sample</span><span class="o">.</span><span class="n">labels</span><span class="p">)</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="s1">'prevalence:'</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">strprev</span><span class="p">(</span><span class="n">sample</span><span class="o">.</span><span class="n">prevalence</span><span class="p">(),</span> <span class="n">prec</span><span class="o">=</span><span class="mi">2</span><span class="p">))</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Which outputs:</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">instances</span><span class="p">:</span> <span class="p">[</span><span class="s1">'the only negative document'</span> <span class="s1">'2nd positive document'</span>
|
||
<span class="s1">'2nd positive document'</span> <span class="s1">'2nd neutral document'</span> <span class="s1">'1st positive document'</span>
|
||
<span class="s1">'the only negative document'</span> <span class="s1">'the only negative document'</span>
|
||
<span class="s1">'the only negative document'</span> <span class="s1">'2nd positive document'</span>
|
||
<span class="s1">'1st positive document'</span><span class="p">]</span>
|
||
<span class="n">labels</span><span class="p">:</span> <span class="p">[</span><span class="mi">0</span> <span class="mi">2</span> <span class="mi">2</span> <span class="mi">1</span> <span class="mi">2</span> <span class="mi">0</span> <span class="mi">0</span> <span class="mi">0</span> <span class="mi">2</span> <span class="mi">2</span><span class="p">]</span>
|
||
<span class="n">prevalence</span><span class="p">:</span> <span class="p">[</span><span class="mf">0.40</span><span class="p">,</span> <span class="mf">0.10</span><span class="p">,</span> <span class="mf">0.50</span><span class="p">]</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Samples can be made consistent across different runs (e.g., to test
|
||
different methods on the same exact samples) by sampling and retaining
|
||
the indexes, which can then be used to generate the sample:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">index</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">sampling_index</span><span class="p">(</span><span class="n">sample_size</span><span class="p">,</span> <span class="o">*</span><span class="n">prev</span><span class="p">)</span>
|
||
<span class="k">for</span> <span class="n">method</span> <span class="ow">in</span> <span class="n">methods</span><span class="p">:</span>
|
||
<span class="n">sample</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">sampling_from_index</span><span class="p">(</span><span class="n">index</span><span class="p">)</span>
|
||
<span class="o">...</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>However, generating samples for evaluation purposes is tackled in QuaPy
|
||
by means of the evaluation protocols (see the dedicated entries in the Wiki
|
||
for <a class="reference external" href="https://github.com/HLT-ISTI/QuaPy/wiki/Evaluation">evaluation</a> and
|
||
<a class="reference external" href="https://github.com/HLT-ISTI/QuaPy/wiki/Protocols">protocols</a>).</p>
|
||
<section id="reviews-datasets">
|
||
<h2>Reviews Datasets<a class="headerlink" href="#reviews-datasets" title="Permalink to this heading">¶</a></h2>
|
||
<p>Three datasets of reviews about Kindle devices, Harry Potter’s series, and
|
||
the well-known IMDb movie reviews can be fetched using a unified interface.
|
||
For example:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
|
||
<span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">'kindle'</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>These datasets have been used in:</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">Esuli</span><span class="p">,</span> <span class="n">A</span><span class="o">.</span><span class="p">,</span> <span class="n">Moreo</span><span class="p">,</span> <span class="n">A</span><span class="o">.</span><span class="p">,</span> <span class="o">&</span> <span class="n">Sebastiani</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span> <span class="p">(</span><span class="mi">2018</span><span class="p">,</span> <span class="n">October</span><span class="p">)</span><span class="o">.</span>
|
||
<span class="n">A</span> <span class="n">recurrent</span> <span class="n">neural</span> <span class="n">network</span> <span class="k">for</span> <span class="n">sentiment</span> <span class="n">quantification</span><span class="o">.</span>
|
||
<span class="n">In</span> <span class="n">Proceedings</span> <span class="n">of</span> <span class="n">the</span> <span class="mi">27</span><span class="n">th</span> <span class="n">ACM</span> <span class="n">International</span> <span class="n">Conference</span> <span class="n">on</span>
|
||
<span class="n">Information</span> <span class="ow">and</span> <span class="n">Knowledge</span> <span class="n">Management</span> <span class="p">(</span><span class="n">pp</span><span class="o">.</span> <span class="mi">1775</span><span class="o">-</span><span class="mi">1778</span><span class="p">)</span><span class="o">.</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>The list of reviews ids is available in:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">REVIEWS_SENTIMENT_DATASETS</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Some statistics of the available datasets are summarized below:</p>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p>Dataset</p></th>
|
||
<th class="head text-center"><p>classes</p></th>
|
||
<th class="head text-center"><p>train size</p></th>
|
||
<th class="head text-center"><p>test size</p></th>
|
||
<th class="head text-center"><p>train prev</p></th>
|
||
<th class="head text-center"><p>test prev</p></th>
|
||
<th class="head"><p>type</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>hp</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>9533</p></td>
|
||
<td class="text-center"><p>18399</p></td>
|
||
<td class="text-center"><p>[0.018, 0.982]</p></td>
|
||
<td class="text-center"><p>[0.065, 0.935]</p></td>
|
||
<td><p>text</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>kindle</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>3821</p></td>
|
||
<td class="text-center"><p>21591</p></td>
|
||
<td class="text-center"><p>[0.081, 0.919]</p></td>
|
||
<td class="text-center"><p>[0.063, 0.937]</p></td>
|
||
<td><p>text</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>imdb</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>25000</p></td>
|
||
<td class="text-center"><p>25000</p></td>
|
||
<td class="text-center"><p>[0.500, 0.500]</p></td>
|
||
<td class="text-center"><p>[0.500, 0.500]</p></td>
|
||
<td><p>text</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="twitter-sentiment-datasets">
|
||
<h2>Twitter Sentiment Datasets<a class="headerlink" href="#twitter-sentiment-datasets" title="Permalink to this heading">¶</a></h2>
|
||
<p>11 Twitter datasets for sentiment analysis.
|
||
Text is not accessible, and the documents were made available
|
||
in tf-idf format. Each dataset presents two splits: a train/val
|
||
split for model selection purposes, and a train+val/test split
|
||
for model evaluation. The following code exemplifies how to load
|
||
a Twitter dataset for model selection.</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
|
||
<span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_twitter</span><span class="p">(</span><span class="s1">'gasp'</span><span class="p">,</span> <span class="n">for_model_selection</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>The datasets were used in:</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">Gao</span><span class="p">,</span> <span class="n">W</span><span class="o">.</span><span class="p">,</span> <span class="o">&</span> <span class="n">Sebastiani</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span> <span class="p">(</span><span class="mi">2015</span><span class="p">,</span> <span class="n">August</span><span class="p">)</span><span class="o">.</span>
|
||
<span class="n">Tweet</span> <span class="n">sentiment</span><span class="p">:</span> <span class="n">From</span> <span class="n">classification</span> <span class="n">to</span> <span class="n">quantification</span><span class="o">.</span>
|
||
<span class="n">In</span> <span class="mi">2015</span> <span class="n">IEEE</span><span class="o">/</span><span class="n">ACM</span> <span class="n">International</span> <span class="n">Conference</span> <span class="n">on</span> <span class="n">Advances</span> <span class="ow">in</span>
|
||
<span class="n">Social</span> <span class="n">Networks</span> <span class="n">Analysis</span> <span class="ow">and</span> <span class="n">Mining</span> <span class="p">(</span><span class="n">ASONAM</span><span class="p">)</span> <span class="p">(</span><span class="n">pp</span><span class="o">.</span> <span class="mi">97</span><span class="o">-</span><span class="mi">104</span><span class="p">)</span><span class="o">.</span> <span class="n">IEEE</span><span class="o">.</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Three of the datasets (semeval13, semeval14, and semeval15) share the
|
||
same training set (semeval), meaning that the training split one would get
|
||
when requesting any of them is the same. The dataset “semeval” can only
|
||
be requested with “for_model_selection=True”.
|
||
The lists of the Twitter datasets’ ids can be consulted in:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># a list of 11 dataset ids that can be used for model selection or model evaluation</span>
|
||
<span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">TWITTER_SENTIMENT_DATASETS_TEST</span>
|
||
|
||
<span class="c1"># 9 dataset ids in which "semeval13", "semeval14", and "semeval15" are replaced with "semeval"</span>
|
||
<span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">TWITTER_SENTIMENT_DATASETS_TRAIN</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>Some details can be found below:</p>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p>Dataset</p></th>
|
||
<th class="head text-center"><p>classes</p></th>
|
||
<th class="head text-center"><p>train size</p></th>
|
||
<th class="head text-center"><p>test size</p></th>
|
||
<th class="head text-center"><p>features</p></th>
|
||
<th class="head text-center"><p>train prev</p></th>
|
||
<th class="head text-center"><p>test prev</p></th>
|
||
<th class="head"><p>type</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>gasp</p></td>
|
||
<td class="text-center"><p>3</p></td>
|
||
<td class="text-center"><p>8788</p></td>
|
||
<td class="text-center"><p>3765</p></td>
|
||
<td class="text-center"><p>694582</p></td>
|
||
<td class="text-center"><p>[0.421, 0.496, 0.082]</p></td>
|
||
<td class="text-center"><p>[0.407, 0.507, 0.086]</p></td>
|
||
<td><p>sparse</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>hcr</p></td>
|
||
<td class="text-center"><p>3</p></td>
|
||
<td class="text-center"><p>1594</p></td>
|
||
<td class="text-center"><p>798</p></td>
|
||
<td class="text-center"><p>222046</p></td>
|
||
<td class="text-center"><p>[0.546, 0.211, 0.243]</p></td>
|
||
<td class="text-center"><p>[0.640, 0.167, 0.193]</p></td>
|
||
<td><p>sparse</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>omd</p></td>
|
||
<td class="text-center"><p>3</p></td>
|
||
<td class="text-center"><p>1839</p></td>
|
||
<td class="text-center"><p>787</p></td>
|
||
<td class="text-center"><p>199151</p></td>
|
||
<td class="text-center"><p>[0.463, 0.271, 0.266]</p></td>
|
||
<td class="text-center"><p>[0.437, 0.283, 0.280]</p></td>
|
||
<td><p>sparse</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>sanders</p></td>
|
||
<td class="text-center"><p>3</p></td>
|
||
<td class="text-center"><p>2155</p></td>
|
||
<td class="text-center"><p>923</p></td>
|
||
<td class="text-center"><p>229399</p></td>
|
||
<td class="text-center"><p>[0.161, 0.691, 0.148]</p></td>
|
||
<td class="text-center"><p>[0.164, 0.688, 0.148]</p></td>
|
||
<td><p>sparse</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>semeval13</p></td>
|
||
<td class="text-center"><p>3</p></td>
|
||
<td class="text-center"><p>11338</p></td>
|
||
<td class="text-center"><p>3813</p></td>
|
||
<td class="text-center"><p>1215742</p></td>
|
||
<td class="text-center"><p>[0.159, 0.470, 0.372]</p></td>
|
||
<td class="text-center"><p>[0.158, 0.430, 0.412]</p></td>
|
||
<td><p>sparse</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>semeval14</p></td>
|
||
<td class="text-center"><p>3</p></td>
|
||
<td class="text-center"><p>11338</p></td>
|
||
<td class="text-center"><p>1853</p></td>
|
||
<td class="text-center"><p>1215742</p></td>
|
||
<td class="text-center"><p>[0.159, 0.470, 0.372]</p></td>
|
||
<td class="text-center"><p>[0.109, 0.361, 0.530]</p></td>
|
||
<td><p>sparse</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>semeval15</p></td>
|
||
<td class="text-center"><p>3</p></td>
|
||
<td class="text-center"><p>11338</p></td>
|
||
<td class="text-center"><p>2390</p></td>
|
||
<td class="text-center"><p>1215742</p></td>
|
||
<td class="text-center"><p>[0.159, 0.470, 0.372]</p></td>
|
||
<td class="text-center"><p>[0.153, 0.413, 0.434]</p></td>
|
||
<td><p>sparse</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>semeval16</p></td>
|
||
<td class="text-center"><p>3</p></td>
|
||
<td class="text-center"><p>8000</p></td>
|
||
<td class="text-center"><p>2000</p></td>
|
||
<td class="text-center"><p>889504</p></td>
|
||
<td class="text-center"><p>[0.157, 0.351, 0.492]</p></td>
|
||
<td class="text-center"><p>[0.163, 0.341, 0.497]</p></td>
|
||
<td><p>sparse</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>sst</p></td>
|
||
<td class="text-center"><p>3</p></td>
|
||
<td class="text-center"><p>2971</p></td>
|
||
<td class="text-center"><p>1271</p></td>
|
||
<td class="text-center"><p>376132</p></td>
|
||
<td class="text-center"><p>[0.261, 0.452, 0.288]</p></td>
|
||
<td class="text-center"><p>[0.207, 0.481, 0.312]</p></td>
|
||
<td><p>sparse</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>wa</p></td>
|
||
<td class="text-center"><p>3</p></td>
|
||
<td class="text-center"><p>2184</p></td>
|
||
<td class="text-center"><p>936</p></td>
|
||
<td class="text-center"><p>248563</p></td>
|
||
<td class="text-center"><p>[0.305, 0.414, 0.281]</p></td>
|
||
<td class="text-center"><p>[0.282, 0.446, 0.272]</p></td>
|
||
<td><p>sparse</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>wb</p></td>
|
||
<td class="text-center"><p>3</p></td>
|
||
<td class="text-center"><p>4259</p></td>
|
||
<td class="text-center"><p>1823</p></td>
|
||
<td class="text-center"><p>404333</p></td>
|
||
<td class="text-center"><p>[0.270, 0.392, 0.337]</p></td>
|
||
<td class="text-center"><p>[0.274, 0.392, 0.335]</p></td>
|
||
<td><p>sparse</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="uci-machine-learning">
|
||
<h2>UCI Machine Learning<a class="headerlink" href="#uci-machine-learning" title="Permalink to this heading">¶</a></h2>
|
||
<p>A set of 32 datasets from the <a class="reference external" href="https://archive.ics.uci.edu/ml/datasets.php">UCI Machine Learning repository</a>
|
||
used in:</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">Pérez</span><span class="o">-</span><span class="n">Gállego</span><span class="p">,</span> <span class="n">P</span><span class="o">.</span><span class="p">,</span> <span class="n">Quevedo</span><span class="p">,</span> <span class="n">J</span><span class="o">.</span> <span class="n">R</span><span class="o">.</span><span class="p">,</span> <span class="o">&</span> <span class="k">del</span> <span class="n">Coz</span><span class="p">,</span> <span class="n">J</span><span class="o">.</span> <span class="n">J</span><span class="o">.</span> <span class="p">(</span><span class="mi">2017</span><span class="p">)</span><span class="o">.</span>
|
||
<span class="n">Using</span> <span class="n">ensembles</span> <span class="k">for</span> <span class="n">problems</span> <span class="k">with</span> <span class="n">characterizable</span> <span class="n">changes</span>
|
||
<span class="ow">in</span> <span class="n">data</span> <span class="n">distribution</span><span class="p">:</span> <span class="n">A</span> <span class="n">case</span> <span class="n">study</span> <span class="n">on</span> <span class="n">quantification</span><span class="o">.</span>
|
||
<span class="n">Information</span> <span class="n">Fusion</span><span class="p">,</span> <span class="mi">34</span><span class="p">,</span> <span class="mi">87</span><span class="o">-</span><span class="mf">100.</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>The list does not exactly coincide with that used in Pérez-Gállego et al. 2017
|
||
since we were unable to find the datasets with ids “diabetes” and “phoneme”.</p>
|
||
<p>These datasets can be loaded by calling, e.g.:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
|
||
<span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_UCIDataset</span><span class="p">(</span><span class="s1">'yeast'</span><span class="p">,</span> <span class="n">verbose</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>This call will return a <em>Dataset</em> object in which the training and
|
||
test splits are randomly drawn, in a stratified manner, from the whole
|
||
collection at 70% and 30%, respectively. The <em>verbose=True</em> option indicates
|
||
that the dataset description should be printed to standard output.
|
||
The original data is not split,
|
||
and some papers submit the entire collection to a kFCV validation.
|
||
In order to accommodate these practices, one could first instantiate
|
||
the entire collection, and then create a generator that will return one
|
||
training+test dataset at a time, following a kFCV protocol:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
|
||
<span class="n">collection</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_UCILabelledCollection</span><span class="p">(</span><span class="s2">"yeast"</span><span class="p">)</span>
|
||
<span class="k">for</span> <span class="n">data</span> <span class="ow">in</span> <span class="n">qp</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">Dataset</span><span class="o">.</span><span class="n">kFCV</span><span class="p">(</span><span class="n">collection</span><span class="p">,</span> <span class="n">nfolds</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">nrepeats</span><span class="o">=</span><span class="mi">2</span><span class="p">):</span>
|
||
<span class="o">...</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>The above code allows one to conduct a 2x5FCV evaluation on the “yeast” dataset.</p>
|
||
<p>All datasets come in numerical form (dense matrices); some statistics
|
||
are summarized below.</p>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p>Dataset</p></th>
|
||
<th class="head text-center"><p>classes</p></th>
|
||
<th class="head text-center"><p>instances</p></th>
|
||
<th class="head text-center"><p>features</p></th>
|
||
<th class="head text-center"><p>prev</p></th>
|
||
<th class="head"><p>type</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>acute.a</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>120</p></td>
|
||
<td class="text-center"><p>6</p></td>
|
||
<td class="text-center"><p>[0.508, 0.492]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>acute.b</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>120</p></td>
|
||
<td class="text-center"><p>6</p></td>
|
||
<td class="text-center"><p>[0.583, 0.417]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>balance.1</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>625</p></td>
|
||
<td class="text-center"><p>4</p></td>
|
||
<td class="text-center"><p>[0.539, 0.461]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>balance.2</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>625</p></td>
|
||
<td class="text-center"><p>4</p></td>
|
||
<td class="text-center"><p>[0.922, 0.078]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>balance.3</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>625</p></td>
|
||
<td class="text-center"><p>4</p></td>
|
||
<td class="text-center"><p>[0.539, 0.461]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>breast-cancer</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>683</p></td>
|
||
<td class="text-center"><p>9</p></td>
|
||
<td class="text-center"><p>[0.350, 0.650]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>cmc.1</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>1473</p></td>
|
||
<td class="text-center"><p>9</p></td>
|
||
<td class="text-center"><p>[0.573, 0.427]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>cmc.2</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>1473</p></td>
|
||
<td class="text-center"><p>9</p></td>
|
||
<td class="text-center"><p>[0.774, 0.226]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>cmc.3</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>1473</p></td>
|
||
<td class="text-center"><p>9</p></td>
|
||
<td class="text-center"><p>[0.653, 0.347]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>ctg.1</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>2126</p></td>
|
||
<td class="text-center"><p>22</p></td>
|
||
<td class="text-center"><p>[0.222, 0.778]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>ctg.2</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>2126</p></td>
|
||
<td class="text-center"><p>22</p></td>
|
||
<td class="text-center"><p>[0.861, 0.139]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>ctg.3</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>2126</p></td>
|
||
<td class="text-center"><p>22</p></td>
|
||
<td class="text-center"><p>[0.917, 0.083]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>german</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>1000</p></td>
|
||
<td class="text-center"><p>24</p></td>
|
||
<td class="text-center"><p>[0.300, 0.700]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>haberman</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>306</p></td>
|
||
<td class="text-center"><p>3</p></td>
|
||
<td class="text-center"><p>[0.735, 0.265]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>ionosphere</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>351</p></td>
|
||
<td class="text-center"><p>34</p></td>
|
||
<td class="text-center"><p>[0.641, 0.359]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>iris.1</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>150</p></td>
|
||
<td class="text-center"><p>4</p></td>
|
||
<td class="text-center"><p>[0.667, 0.333]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>iris.2</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>150</p></td>
|
||
<td class="text-center"><p>4</p></td>
|
||
<td class="text-center"><p>[0.667, 0.333]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>iris.3</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>150</p></td>
|
||
<td class="text-center"><p>4</p></td>
|
||
<td class="text-center"><p>[0.667, 0.333]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>mammographic</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>830</p></td>
|
||
<td class="text-center"><p>5</p></td>
|
||
<td class="text-center"><p>[0.514, 0.486]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>pageblocks.5</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>5473</p></td>
|
||
<td class="text-center"><p>10</p></td>
|
||
<td class="text-center"><p>[0.979, 0.021]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>semeion</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>1593</p></td>
|
||
<td class="text-center"><p>256</p></td>
|
||
<td class="text-center"><p>[0.901, 0.099]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>sonar</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>208</p></td>
|
||
<td class="text-center"><p>60</p></td>
|
||
<td class="text-center"><p>[0.534, 0.466]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>spambase</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>4601</p></td>
|
||
<td class="text-center"><p>57</p></td>
|
||
<td class="text-center"><p>[0.606, 0.394]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>spectf</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>267</p></td>
|
||
<td class="text-center"><p>44</p></td>
|
||
<td class="text-center"><p>[0.794, 0.206]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>tictactoe</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>958</p></td>
|
||
<td class="text-center"><p>9</p></td>
|
||
<td class="text-center"><p>[0.653, 0.347]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>transfusion</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>748</p></td>
|
||
<td class="text-center"><p>4</p></td>
|
||
<td class="text-center"><p>[0.762, 0.238]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>wdbc</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>569</p></td>
|
||
<td class="text-center"><p>30</p></td>
|
||
<td class="text-center"><p>[0.627, 0.373]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>wine.1</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>178</p></td>
|
||
<td class="text-center"><p>13</p></td>
|
||
<td class="text-center"><p>[0.669, 0.331]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>wine.2</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>178</p></td>
|
||
<td class="text-center"><p>13</p></td>
|
||
<td class="text-center"><p>[0.601, 0.399]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>wine.3</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>178</p></td>
|
||
<td class="text-center"><p>13</p></td>
|
||
<td class="text-center"><p>[0.730, 0.270]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>wine-q-red</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>1599</p></td>
|
||
<td class="text-center"><p>11</p></td>
|
||
<td class="text-center"><p>[0.465, 0.535]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>wine-q-white</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>4898</p></td>
|
||
<td class="text-center"><p>11</p></td>
|
||
<td class="text-center"><p>[0.335, 0.665]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>yeast</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>1484</p></td>
|
||
<td class="text-center"><p>8</p></td>
|
||
<td class="text-center"><p>[0.711, 0.289]</p></td>
|
||
<td><p>dense</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<section id="issues">
|
||
<h3>Issues:<a class="headerlink" href="#issues" title="Permalink to this heading">¶</a></h3>
|
||
<p>All datasets will be downloaded automatically the first time they are requested, and
|
||
stored in the <em>quapy_data</em> folder for faster further reuse.
|
||
However, some datasets require special actions that at the moment are not fully
|
||
automated.</p>
|
||
<ul class="simple">
|
||
<li><p>Datasets with ids “ctg.1”, “ctg.2”, and “ctg.3” (<em>Cardiotocography Data Set</em>) load
|
||
an Excel file, which requires the user to install the <em>xlrd</em> Python module in order
|
||
to open it.</p></li>
|
||
<li><p>The dataset with id “pageblocks.5” (<em>Page Blocks Classification (5)</em>) needs to
|
||
open a “unix compressed file” (extension .Z), which is not directly doable with
|
||
standard Python packages like gzip or zip. This file would need to be uncompressed using
|
||
OS-dependent software manually. Information on how to do it will be printed the first
|
||
time the dataset is invoked.</p></li>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
<section id="lequa-datasets">
|
||
<h2>LeQua Datasets<a class="headerlink" href="#lequa-datasets" title="Permalink to this heading">¶</a></h2>
|
||
<p>QuaPy also provides the datasets used for the LeQua competition.
|
||
In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification
|
||
problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide
|
||
raw documents instead.
|
||
Tasks T1A and T2A are binary sentiment quantification problems, while T1B and T2B
|
||
are multiclass quantification problems consisting of estimating the class prevalence
|
||
values of 28 different merchandise products.</p>
|
||
<p>Every task consists of a training set, a set of validation samples (for model selection)
|
||
and a set of test samples (for evaluation). QuaPy returns this data as a LabelledCollection
|
||
(training) and two generation protocols (for validation and test samples), as follows:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">training</span><span class="p">,</span> <span class="n">val_generator</span><span class="p">,</span> <span class="n">test_generator</span> <span class="o">=</span> <span class="n">fetch_lequa2022</span><span class="p">(</span><span class="n">task</span><span class="o">=</span><span class="n">task</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>See the <code class="docutils literal notranslate"><span class="pre">lequa2022_experiments.py</span></code> in the examples folder for further details on how to
|
||
carry out experiments using these datasets.</p>
|
||
<p>The datasets are downloaded only once, and stored for fast reuse.</p>
|
||
<p>Some statistics are summarized below:</p>
|
||
<table class="docutils align-default">
|
||
<thead>
|
||
<tr class="row-odd"><th class="head"><p>Dataset</p></th>
|
||
<th class="head text-center"><p>classes</p></th>
|
||
<th class="head text-center"><p>train size</p></th>
|
||
<th class="head text-center"><p>validation samples</p></th>
|
||
<th class="head text-center"><p>test samples</p></th>
|
||
<th class="head text-center"><p>docs by sample</p></th>
|
||
<th class="head text-center"><p>type</p></th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="row-even"><td><p>T1A</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>5000</p></td>
|
||
<td class="text-center"><p>1000</p></td>
|
||
<td class="text-center"><p>5000</p></td>
|
||
<td class="text-center"><p>250</p></td>
|
||
<td class="text-center"><p>vector</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>T1B</p></td>
|
||
<td class="text-center"><p>28</p></td>
|
||
<td class="text-center"><p>20000</p></td>
|
||
<td class="text-center"><p>1000</p></td>
|
||
<td class="text-center"><p>5000</p></td>
|
||
<td class="text-center"><p>1000</p></td>
|
||
<td class="text-center"><p>vector</p></td>
|
||
</tr>
|
||
<tr class="row-even"><td><p>T2A</p></td>
|
||
<td class="text-center"><p>2</p></td>
|
||
<td class="text-center"><p>5000</p></td>
|
||
<td class="text-center"><p>1000</p></td>
|
||
<td class="text-center"><p>5000</p></td>
|
||
<td class="text-center"><p>250</p></td>
|
||
<td class="text-center"><p>text</p></td>
|
||
</tr>
|
||
<tr class="row-odd"><td><p>T2B</p></td>
|
||
<td class="text-center"><p>28</p></td>
|
||
<td class="text-center"><p>20000</p></td>
|
||
<td class="text-center"><p>1000</p></td>
|
||
<td class="text-center"><p>5000</p></td>
|
||
<td class="text-center"><p>1000</p></td>
|
||
<td class="text-center"><p>text</p></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p>For further details on the datasets, we refer to the original
|
||
<a class="reference external" href="https://ceur-ws.org/Vol-3180/paper-146.pdf">paper</a>:</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">Esuli</span><span class="p">,</span> <span class="n">A</span><span class="o">.</span><span class="p">,</span> <span class="n">Moreo</span><span class="p">,</span> <span class="n">A</span><span class="o">.</span><span class="p">,</span> <span class="n">Sebastiani</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="p">,</span> <span class="o">&</span> <span class="n">Sperduti</span><span class="p">,</span> <span class="n">G</span><span class="o">.</span> <span class="p">(</span><span class="mi">2022</span><span class="p">)</span><span class="o">.</span>
|
||
<span class="n">A</span> <span class="n">Detailed</span> <span class="n">Overview</span> <span class="n">of</span> <span class="n">LeQua</span><span class="o">@</span> <span class="n">CLEF</span> <span class="mi">2022</span><span class="p">:</span> <span class="n">Learning</span> <span class="n">to</span> <span class="n">Quantify</span><span class="o">.</span>
|
||
</pre></div>
|
||
</div>
|
||
</section>
|
||
<section id="adding-custom-datasets">
|
||
<h2>Adding Custom Datasets<a class="headerlink" href="#adding-custom-datasets" title="Permalink to this heading">¶</a></h2>
|
||
<p>QuaPy provides data loaders for simple formats dealing with
|
||
text, following the format:</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">class</span><span class="o">-</span><span class="nb">id</span> \<span class="n">t</span> <span class="n">first</span> <span class="n">document</span><span class="s1">'s pre-processed text </span><span class="se">\n</span>
|
||
<span class="n">class</span><span class="o">-</span><span class="nb">id</span> \<span class="n">t</span> <span class="n">second</span> <span class="n">document</span><span class="s1">'s pre-processed text </span><span class="se">\n</span>
|
||
<span class="o">...</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>and sparse representations of the form:</p>
|
||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">{</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="ow">or</span> <span class="o">+</span><span class="mi">1</span><span class="p">}</span> <span class="n">col</span><span class="p">(</span><span class="nb">int</span><span class="p">):</span><span class="n">val</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span> <span class="n">col</span><span class="p">(</span><span class="nb">int</span><span class="p">):</span><span class="n">val</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span> <span class="o">...</span> \<span class="n">n</span>
|
||
<span class="o">...</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>The code in charge of loading a LabelledCollection is:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@classmethod</span>
|
||
<span class="k">def</span> <span class="nf">load</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span><span class="nb">str</span><span class="p">,</span> <span class="n">loader_func</span><span class="p">:</span><span class="n">callable</span><span class="p">):</span>
|
||
<span class="k">return</span> <span class="n">LabelledCollection</span><span class="p">(</span><span class="o">*</span><span class="n">loader_func</span><span class="p">(</span><span class="n">path</span><span class="p">))</span>
|
||
</pre></div>
|
||
</div>
|
||
<p>indicating that any <em>loader_func</em> (e.g., a user-defined one) which
|
||
returns valid arguments for initializing a <em>LabelledCollection</em> object will allow one
|
||
to load any collection. In particular, the <em>LabelledCollection</em> receives as
|
||
arguments the instances (as an iterable) and the labels (as an iterable) and,
|
||
additionally, the number of classes can be specified (it would otherwise be
|
||
inferred from the labels, but that requires at least one positive example for
|
||
all classes to be present in the collection).</p>
|
||
<p>The same <em>loader_func</em> can be passed to a Dataset, along with two
|
||
paths, in order to create a training and test pair of <em>LabelledCollection</em>,
|
||
e.g.:</p>
|
||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
|
||
|
||
<span class="n">train_path</span> <span class="o">=</span> <span class="s1">'../my_data/train.dat'</span>
|
||
<span class="n">test_path</span> <span class="o">=</span> <span class="s1">'../my_data/test.dat'</span>
|
||
|
||
<span class="k">def</span> <span class="nf">my_custom_loader</span><span class="p">(</span><span class="n">path</span><span class="p">):</span>
|
||
<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="s1">'rb'</span><span class="p">)</span> <span class="k">as</span> <span class="n">fin</span><span class="p">:</span>
|
||
<span class="o">...</span>
|
||
<span class="k">return</span> <span class="n">instances</span><span class="p">,</span> <span class="n">labels</span>
|
||
|
||
<span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">Dataset</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">train_path</span><span class="p">,</span> <span class="n">test_path</span><span class="p">,</span> <span class="n">my_custom_loader</span><span class="p">)</span>
|
||
</pre></div>
|
||
</div>
|
||
<section id="data-processing">
|
||
<h3>Data Processing<a class="headerlink" href="#data-processing" title="Permalink to this heading">¶</a></h3>
|
||
<p>QuaPy implements a number of preprocessing functions in the package <em>qp.data.preprocessing</em>, including:</p>
|
||
<ul class="simple">
|
||
<li><p><em>text2tfidf</em>: tfidf vectorization</p></li>
|
||
<li><p><em>reduce_columns</em>: reducing the number of columns based on term frequency</p></li>
|
||
<li><p><em>standardize</em>: transforms the column values into z-scores (i.e., subtracts the mean and normalizes by the standard deviation, so
|
||
that the column values have zero mean and unit variance).</p></li>
|
||
<li><p><em>index</em>: transforms textual tokens into lists of numeric ids</p></li>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
</section>
|
||
|
||
|
||
<div class="clearer"></div>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
|
||
<div class="sphinxsidebarwrapper">
|
||
<div>
|
||
<h3><a href="index.html">Table of Contents</a></h3>
|
||
<ul>
|
||
<li><a class="reference internal" href="#">Datasets</a><ul>
|
||
<li><a class="reference internal" href="#reviews-datasets">Reviews Datasets</a></li>
|
||
<li><a class="reference internal" href="#twitter-sentiment-datasets">Twitter Sentiment Datasets</a></li>
|
||
<li><a class="reference internal" href="#uci-machine-learning">UCI Machine Learning</a><ul>
|
||
<li><a class="reference internal" href="#issues">Issues:</a></li>
|
||
</ul>
|
||
</li>
|
||
<li><a class="reference internal" href="#lequa-datasets">LeQua Datasets</a></li>
|
||
<li><a class="reference internal" href="#adding-custom-datasets">Adding Custom Datasets</a><ul>
|
||
<li><a class="reference internal" href="#data-processing">Data Processing</a></li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
|
||
</div>
|
||
<div>
|
||
<h4>Previous topic</h4>
|
||
<p class="topless"><a href="Installation.html"
|
||
title="previous chapter">Installation</a></p>
|
||
</div>
|
||
<div>
|
||
<h4>Next topic</h4>
|
||
<p class="topless"><a href="Evaluation.html"
|
||
title="next chapter">Evaluation</a></p>
|
||
</div>
|
||
<div role="note" aria-label="source link">
|
||
<h3>This Page</h3>
|
||
<ul class="this-page-menu">
|
||
<li><a href="_sources/Datasets.md.txt"
|
||
rel="nofollow">Show Source</a></li>
|
||
</ul>
|
||
</div>
|
||
<div id="searchbox" style="display: none" role="search">
|
||
<h3 id="searchlabel">Quick search</h3>
|
||
<div class="searchformwrapper">
|
||
<form class="search" action="search.html" method="get">
|
||
<input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
|
||
<input type="submit" value="Go" />
|
||
</form>
|
||
</div>
|
||
</div>
|
||
<script>document.getElementById('searchbox').style.display = "block"</script>
|
||
</div>
|
||
</div>
|
||
<div class="clearer"></div>
|
||
</div>
|
||
<div class="related" role="navigation" aria-label="related navigation">
|
||
<h3>Navigation</h3>
|
||
<ul>
|
||
<li class="right" style="margin-right: 10px">
|
||
<a href="genindex.html" title="General Index"
|
||
>index</a></li>
|
||
<li class="right" >
|
||
<a href="py-modindex.html" title="Python Module Index"
|
||
>modules</a> |</li>
|
||
<li class="right" >
|
||
<a href="Evaluation.html" title="Evaluation"
|
||
>next</a> |</li>
|
||
<li class="right" >
|
||
<a href="Installation.html" title="Installation"
|
||
>previous</a> |</li>
|
||
<li class="nav-item nav-item-0"><a href="index.html">QuaPy 0.1.7 documentation</a> »</li>
|
||
<li class="nav-item nav-item-this"><a href="">Datasets</a></li>
|
||
</ul>
|
||
</div>
|
||
<div class="footer" role="contentinfo">
|
||
© Copyright 2021, Alejandro Moreo.
|
||
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 5.3.0.
|
||
</div>
|
||
</body>
|
||
</html> |