945 lines
142 KiB
HTML
945 lines
142 KiB
HTML
<!DOCTYPE html>
|
||
<html class="writer-html5" lang="en" data-content_root="../../../">
|
||
<head>
|
||
<meta charset="utf-8" />
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||
<title>quapy.data.datasets — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation</title>
|
||
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=92fd9be5" />
|
||
<link rel="stylesheet" type="text/css" href="../../../_static/css/theme.css?v=19f00094" />
|
||
|
||
|
||
<!--[if lt IE 9]>
|
||
<script src="../../../_static/js/html5shiv.min.js"></script>
|
||
<![endif]-->
|
||
|
||
<script src="../../../_static/jquery.js?v=5d32c60e"></script>
|
||
<script src="../../../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
|
||
<script src="../../../_static/documentation_options.js?v=22607128"></script>
|
||
<script src="../../../_static/doctools.js?v=9a2dae69"></script>
|
||
<script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
|
||
<script src="../../../_static/js/theme.js"></script>
|
||
<link rel="index" title="Index" href="../../../genindex.html" />
|
||
<link rel="search" title="Search" href="../../../search.html" />
|
||
</head>
|
||
|
||
<body class="wy-body-for-nav">
|
||
<div class="wy-grid-for-nav">
|
||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||
<div class="wy-side-scroll">
|
||
<div class="wy-side-nav-search" >
|
||
|
||
|
||
|
||
<a href="../../../index.html" class="icon icon-home">
|
||
QuaPy: A Python-based open-source framework for quantification
|
||
</a>
|
||
<div role="search">
|
||
<form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
|
||
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
|
||
<input type="hidden" name="check_keywords" value="yes" />
|
||
<input type="hidden" name="area" value="default" />
|
||
</form>
|
||
</div>
|
||
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../../../modules.html">quapy</a></li>
|
||
</ul>
|
||
|
||
</div>
|
||
</div>
|
||
</nav>
|
||
|
||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
|
||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||
<a href="../../../index.html">QuaPy: A Python-based open-source framework for quantification</a>
|
||
</nav>
|
||
|
||
<div class="wy-nav-content">
|
||
<div class="rst-content">
|
||
<div role="navigation" aria-label="Page navigation">
|
||
<ul class="wy-breadcrumbs">
|
||
<li><a href="../../../index.html" class="icon icon-home" aria-label="Home"></a></li>
|
||
<li class="breadcrumb-item"><a href="../../index.html">Module code</a></li>
|
||
<li class="breadcrumb-item active">quapy.data.datasets</li>
|
||
<li class="wy-breadcrumbs-aside">
|
||
</li>
|
||
</ul>
|
||
<hr/>
|
||
</div>
|
||
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
|
||
<div itemprop="articleBody">
|
||
|
||
<h1>Source code for quapy.data.datasets</h1><div class="highlight"><pre>
|
||
<div class="viewcode-block" id="warn">
|
||
<a class="viewcode-back" href="../../../quapy.data.html#quapy.data.datasets.warn">[docs]</a>
|
||
<span></span><span class="k">def</span> <span class="nf">warn</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
|
||
<span class="k">pass</span></div>
|
||
|
||
<span class="kn">import</span> <span class="nn">warnings</span>
|
||
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span> <span class="o">=</span> <span class="n">warn</span>
|
||
<span class="kn">import</span> <span class="nn">os</span>
|
||
<span class="kn">import</span> <span class="nn">zipfile</span>
|
||
<span class="kn">from</span> <span class="nn">os.path</span> <span class="kn">import</span> <span class="n">join</span>
|
||
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
|
||
<span class="kn">from</span> <span class="nn">ucimlrepo</span> <span class="kn">import</span> <span class="n">fetch_ucirepo</span>
|
||
<span class="kn">from</span> <span class="nn">quapy.data.base</span> <span class="kn">import</span> <span class="n">Dataset</span><span class="p">,</span> <span class="n">LabelledCollection</span>
|
||
<span class="kn">from</span> <span class="nn">quapy.data.preprocessing</span> <span class="kn">import</span> <span class="n">text2tfidf</span><span class="p">,</span> <span class="n">reduce_columns</span>
|
||
<span class="kn">from</span> <span class="nn">quapy.data.reader</span> <span class="kn">import</span> <span class="o">*</span>
|
||
<span class="kn">from</span> <span class="nn">quapy.util</span> <span class="kn">import</span> <span class="n">download_file_if_not_exists</span><span class="p">,</span> <span class="n">download_file</span><span class="p">,</span> <span class="n">get_quapy_home</span><span class="p">,</span> <span class="n">pickled_resource</span>
|
||
|
||
|
||
<span class="n">REVIEWS_SENTIMENT_DATASETS</span> <span class="o">=</span> <span class="p">[</span><span class="s1">'hp'</span><span class="p">,</span> <span class="s1">'kindle'</span><span class="p">,</span> <span class="s1">'imdb'</span><span class="p">]</span>
|
||
<span class="n">TWITTER_SENTIMENT_DATASETS_TEST</span> <span class="o">=</span> <span class="p">[</span><span class="s1">'gasp'</span><span class="p">,</span> <span class="s1">'hcr'</span><span class="p">,</span> <span class="s1">'omd'</span><span class="p">,</span> <span class="s1">'sanders'</span><span class="p">,</span>
|
||
<span class="s1">'semeval13'</span><span class="p">,</span> <span class="s1">'semeval14'</span><span class="p">,</span> <span class="s1">'semeval15'</span><span class="p">,</span> <span class="s1">'semeval16'</span><span class="p">,</span>
|
||
<span class="s1">'sst'</span><span class="p">,</span> <span class="s1">'wa'</span><span class="p">,</span> <span class="s1">'wb'</span><span class="p">]</span>
|
||
<span class="n">TWITTER_SENTIMENT_DATASETS_TRAIN</span> <span class="o">=</span> <span class="p">[</span><span class="s1">'gasp'</span><span class="p">,</span> <span class="s1">'hcr'</span><span class="p">,</span> <span class="s1">'omd'</span><span class="p">,</span> <span class="s1">'sanders'</span><span class="p">,</span>
|
||
<span class="s1">'semeval'</span><span class="p">,</span> <span class="s1">'semeval16'</span><span class="p">,</span>
|
||
<span class="s1">'sst'</span><span class="p">,</span> <span class="s1">'wa'</span><span class="p">,</span> <span class="s1">'wb'</span><span class="p">]</span>
|
||
<span class="n">UCI_BINARY_DATASETS</span> <span class="o">=</span> <span class="p">[</span><span class="s1">'acute.a'</span><span class="p">,</span> <span class="s1">'acute.b'</span><span class="p">,</span>
|
||
<span class="s1">'balance.1'</span><span class="p">,</span> <span class="s1">'balance.2'</span><span class="p">,</span> <span class="s1">'balance.3'</span><span class="p">,</span>
|
||
<span class="s1">'breast-cancer'</span><span class="p">,</span>
|
||
<span class="s1">'cmc.1'</span><span class="p">,</span> <span class="s1">'cmc.2'</span><span class="p">,</span> <span class="s1">'cmc.3'</span><span class="p">,</span>
|
||
<span class="s1">'ctg.1'</span><span class="p">,</span> <span class="s1">'ctg.2'</span><span class="p">,</span> <span class="s1">'ctg.3'</span><span class="p">,</span>
|
||
<span class="c1">#'diabetes', # &lt;-- I haven't found this one...</span>
|
||
<span class="s1">'german'</span><span class="p">,</span>
|
||
<span class="s1">'haberman'</span><span class="p">,</span>
|
||
<span class="s1">'ionosphere'</span><span class="p">,</span>
|
||
<span class="s1">'iris.1'</span><span class="p">,</span> <span class="s1">'iris.2'</span><span class="p">,</span> <span class="s1">'iris.3'</span><span class="p">,</span>
|
||
<span class="s1">'mammographic'</span><span class="p">,</span>
|
||
<span class="s1">'pageblocks.5'</span><span class="p">,</span>
|
||
<span class="c1">#'phoneme', # &lt;-- I haven't found this one...</span>
|
||
<span class="s1">'semeion'</span><span class="p">,</span>
|
||
<span class="s1">'sonar'</span><span class="p">,</span>
|
||
<span class="s1">'spambase'</span><span class="p">,</span>
|
||
<span class="s1">'spectf'</span><span class="p">,</span>
|
||
<span class="s1">'tictactoe'</span><span class="p">,</span>
|
||
<span class="s1">'transfusion'</span><span class="p">,</span>
|
||
<span class="s1">'wdbc'</span><span class="p">,</span>
|
||
<span class="s1">'wine.1'</span><span class="p">,</span> <span class="s1">'wine.2'</span><span class="p">,</span> <span class="s1">'wine.3'</span><span class="p">,</span>
|
||
<span class="s1">'wine-q-red'</span><span class="p">,</span> <span class="s1">'wine-q-white'</span><span class="p">,</span>
|
||
<span class="s1">'yeast'</span><span class="p">]</span>
|
||
|
||
<span class="n">UCI_MULTICLASS_DATASETS</span> <span class="o">=</span> <span class="p">[</span><span class="s1">'dry-bean'</span><span class="p">,</span>
|
||
<span class="s1">'wine-quality'</span><span class="p">,</span>
|
||
<span class="s1">'academic-success'</span><span class="p">,</span>
|
||
<span class="s1">'digits'</span><span class="p">,</span>
|
||
<span class="s1">'letter'</span><span class="p">]</span>
|
||
|
||
<span class="n">LEQUA2022_TASKS</span> <span class="o">=</span> <span class="p">[</span><span class="s1">'T1A'</span><span class="p">,</span> <span class="s1">'T1B'</span><span class="p">,</span> <span class="s1">'T2A'</span><span class="p">,</span> <span class="s1">'T2B'</span><span class="p">]</span>
|
||
|
||
<span class="n">_TXA_SAMPLE_SIZE</span> <span class="o">=</span> <span class="mi">250</span>
|
||
<span class="n">_TXB_SAMPLE_SIZE</span> <span class="o">=</span> <span class="mi">1000</span>
|
||
|
||
<span class="n">LEQUA2022_SAMPLE_SIZE</span> <span class="o">=</span> <span class="p">{</span>
|
||
<span class="s1">'TXA'</span><span class="p">:</span> <span class="n">_TXA_SAMPLE_SIZE</span><span class="p">,</span>
|
||
<span class="s1">'TXB'</span><span class="p">:</span> <span class="n">_TXB_SAMPLE_SIZE</span><span class="p">,</span>
|
||
<span class="s1">'T1A'</span><span class="p">:</span> <span class="n">_TXA_SAMPLE_SIZE</span><span class="p">,</span>
|
||
<span class="s1">'T1B'</span><span class="p">:</span> <span class="n">_TXB_SAMPLE_SIZE</span><span class="p">,</span>
|
||
<span class="s1">'T2A'</span><span class="p">:</span> <span class="n">_TXA_SAMPLE_SIZE</span><span class="p">,</span>
|
||
<span class="s1">'T2B'</span><span class="p">:</span> <span class="n">_TXB_SAMPLE_SIZE</span><span class="p">,</span>
|
||
<span class="s1">'binary'</span><span class="p">:</span> <span class="n">_TXA_SAMPLE_SIZE</span><span class="p">,</span>
|
||
<span class="s1">'multiclass'</span><span class="p">:</span> <span class="n">_TXB_SAMPLE_SIZE</span>
|
||
<span class="p">}</span>
|
||
|
||
|
||
<div class="viewcode-block" id="fetch_reviews">
|
||
<a class="viewcode-back" href="../../../quapy.data.html#quapy.data.datasets.fetch_reviews">[docs]</a>
|
||
<span class="k">def</span> <span class="nf">fetch_reviews</span><span class="p">(</span><span class="n">dataset_name</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">data_home</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">pickle</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dataset</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Loads a Reviews dataset as a Dataset instance, as used in</span>
|
||
<span class="sd"> `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."</span>
|
||
<span class="sd"> Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. &lt;https://dl.acm.org/doi/abs/10.1145/3269206.3269287&gt;`_.</span>
|
||
<span class="sd"> The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`</span>
|
||
|
||
<span class="sd"> :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'</span>
|
||
<span class="sd"> :param tfidf: set to True to transform the raw documents into tfidf weighted matrices</span>
|
||
<span class="sd"> :param min_df: minimum number of documents that should contain a term in order for the term to be</span>
|
||
<span class="sd"> kept (ignored if tfidf==False)</span>
|
||
<span class="sd"> :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default</span>
|
||
<span class="sd"> ~/quapy_data/ directory)</span>
|
||
<span class="sd"> :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for</span>
|
||
<span class="sd"> faster subsequent invocations</span>
|
||
<span class="sd"> :return: a :class:`quapy.data.base.Dataset` instance</span>
|
||
<span class="sd"> """</span>
|
||
<span class="k">assert</span> <span class="n">dataset_name</span> <span class="ow">in</span> <span class="n">REVIEWS_SENTIMENT_DATASETS</span><span class="p">,</span> \
|
||
<span class="sa">f</span><span class="s1">'Name </span><span class="si">{</span><span class="n">dataset_name</span><span class="si">}</span><span class="s1"> does not match any known dataset for sentiment reviews. '</span> \
|
||
<span class="sa">f</span><span class="s1">'Valid ones are </span><span class="si">{</span><span class="n">REVIEWS_SENTIMENT_DATASETS</span><span class="si">}</span><span class="s1">'</span>
|
||
<span class="k">if</span> <span class="n">data_home</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
||
<span class="n">data_home</span> <span class="o">=</span> <span class="n">get_quapy_home</span><span class="p">()</span>
|
||
|
||
<span class="n">URL_TRAIN</span> <span class="o">=</span> <span class="sa">f</span><span class="s1">'https://zenodo.org/record/4117827/files/</span><span class="si">{</span><span class="n">dataset_name</span><span class="si">}</span><span class="s1">_train.txt'</span>
|
||
<span class="n">URL_TEST</span> <span class="o">=</span> <span class="sa">f</span><span class="s1">'https://zenodo.org/record/4117827/files/</span><span class="si">{</span><span class="n">dataset_name</span><span class="si">}</span><span class="s1">_test.txt'</span>
|
||
<span class="n">os</span><span class="o">.</span><span class="n">makedirs</span><span class="p">(</span><span class="n">join</span><span class="p">(</span><span class="n">data_home</span><span class="p">,</span> <span class="s1">'reviews'</span><span class="p">),</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
<span class="n">train_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">data_home</span><span class="p">,</span> <span class="s1">'reviews'</span><span class="p">,</span> <span class="n">dataset_name</span><span class="p">,</span> <span class="s1">'train.txt'</span><span class="p">)</span>
|
||
<span class="n">test_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">data_home</span><span class="p">,</span> <span class="s1">'reviews'</span><span class="p">,</span> <span class="n">dataset_name</span><span class="p">,</span> <span class="s1">'test.txt'</span><span class="p">)</span>
|
||
<span class="n">download_file_if_not_exists</span><span class="p">(</span><span class="n">URL_TRAIN</span><span class="p">,</span> <span class="n">train_path</span><span class="p">)</span>
|
||
<span class="n">download_file_if_not_exists</span><span class="p">(</span><span class="n">URL_TEST</span><span class="p">,</span> <span class="n">test_path</span><span class="p">)</span>
|
||
|
||
<span class="n">pickle_path</span> <span class="o">=</span> <span class="kc">None</span>
|
||
<span class="k">if</span> <span class="n">pickle</span><span class="p">:</span>
|
||
<span class="n">pickle_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">data_home</span><span class="p">,</span> <span class="s1">'reviews'</span><span class="p">,</span> <span class="s1">'pickle'</span><span class="p">,</span> <span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">dataset_name</span><span class="si">}</span><span class="s1">.pkl'</span><span class="p">)</span>
|
||
<span class="n">data</span> <span class="o">=</span> <span class="n">pickled_resource</span><span class="p">(</span><span class="n">pickle_path</span><span class="p">,</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">load</span><span class="p">,</span> <span class="n">train_path</span><span class="p">,</span> <span class="n">test_path</span><span class="p">,</span> <span class="n">from_text</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">tfidf</span><span class="p">:</span>
|
||
<span class="n">text2tfidf</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
<span class="k">if</span> <span class="n">min_df</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
||
<span class="n">reduce_columns</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="n">min_df</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
|
||
<span class="n">data</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">dataset_name</span>
|
||
|
||
<span class="k">return</span> <span class="n">data</span></div>
|
||
|
||
|
||
|
||
<div class="viewcode-block" id="fetch_twitter">
|
||
<a class="viewcode-back" href="../../../quapy.data.html#quapy.data.datasets.fetch_twitter">[docs]</a>
|
||
<span class="k">def</span> <span class="nf">fetch_twitter</span><span class="p">(</span><span class="n">dataset_name</span><span class="p">,</span> <span class="n">for_model_selection</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">data_home</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">pickle</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dataset</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:</span>
|
||
<span class="sd"> `Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.</span>
|
||
<span class="sd"> Social Network Analysis and Mining 6(19), 1–22 (2016) &lt;https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf&gt;`_</span>
|
||
<span class="sd"> Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.</span>
|
||
<span class="sd"> The list of valid dataset names corresponding to training sets can be accessed in</span>
|
||
<span class="sd"> `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in</span>
|
||
<span class="sd"> `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`</span>
|
||
|
||
<span class="sd"> :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',</span>
|
||
<span class="sd"> 'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'</span>
|
||
<span class="sd"> :param for_model_selection: if True, then returns the train split as the training set and the devel split</span>
|
||
<span class="sd"> as the test set; if False, then returns the train+devel split as the training set and the test set as the</span>
|
||
<span class="sd"> test set</span>
|
||
<span class="sd"> :param min_df: minimum number of documents that should contain a term in order for the term to be kept</span>
|
||
<span class="sd"> :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default</span>
|
||
<span class="sd"> ~/quapy_data/ directory)</span>
|
||
<span class="sd"> :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for</span>
|
||
<span class="sd"> faster subsequent invocations</span>
|
||
<span class="sd"> :return: a :class:`quapy.data.base.Dataset` instance</span>
|
||
<span class="sd"> """</span>
|
||
<span class="k">assert</span> <span class="n">dataset_name</span> <span class="ow">in</span> <span class="n">TWITTER_SENTIMENT_DATASETS_TRAIN</span> <span class="o">+</span> <span class="n">TWITTER_SENTIMENT_DATASETS_TEST</span><span class="p">,</span> \
|
||
<span class="sa">f</span><span class="s1">'Name </span><span class="si">{</span><span class="n">dataset_name</span><span class="si">}</span><span class="s1"> does not match any known dataset for sentiment twitter. '</span> \
|
||
<span class="sa">f</span><span class="s1">'Valid ones are </span><span class="si">{</span><span class="n">TWITTER_SENTIMENT_DATASETS_TRAIN</span><span class="si">}</span><span class="s1"> for model selection and '</span> \
|
||
<span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">TWITTER_SENTIMENT_DATASETS_TEST</span><span class="si">}</span><span class="s1"> for test (datasets "semeval14", "semeval15", "semeval16" share '</span> \
|
||
<span class="sa">f</span><span class="s1">'a common training set "semeval")'</span>
|
||
<span class="k">if</span> <span class="n">data_home</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
||
<span class="n">data_home</span> <span class="o">=</span> <span class="n">get_quapy_home</span><span class="p">()</span>
|
||
|
||
<span class="n">URL</span> <span class="o">=</span> <span class="s1">'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'</span>
|
||
<span class="n">unzipped_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">data_home</span><span class="p">,</span> <span class="s1">'tweet_sentiment_quantification_snam'</span><span class="p">)</span>
|
||
<span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">unzipped_path</span><span class="p">):</span>
|
||
<span class="n">downloaded_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">data_home</span><span class="p">,</span> <span class="s1">'tweet_sentiment_quantification_snam.zip'</span><span class="p">)</span>
|
||
<span class="n">download_file</span><span class="p">(</span><span class="n">URL</span><span class="p">,</span> <span class="n">downloaded_path</span><span class="p">)</span>
|
||
<span class="k">with</span> <span class="n">zipfile</span><span class="o">.</span><span class="n">ZipFile</span><span class="p">(</span><span class="n">downloaded_path</span><span class="p">)</span> <span class="k">as</span> <span class="n">file</span><span class="p">:</span>
|
||
<span class="n">file</span><span class="o">.</span><span class="n">extractall</span><span class="p">(</span><span class="n">data_home</span><span class="p">)</span>
|
||
<span class="n">os</span><span class="o">.</span><span class="n">remove</span><span class="p">(</span><span class="n">downloaded_path</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">dataset_name</span> <span class="ow">in</span> <span class="p">{</span><span class="s1">'semeval13'</span><span class="p">,</span> <span class="s1">'semeval14'</span><span class="p">,</span> <span class="s1">'semeval15'</span><span class="p">}:</span>
|
||
<span class="n">trainset_name</span> <span class="o">=</span> <span class="s1">'semeval'</span>
|
||
<span class="n">testset_name</span> <span class="o">=</span> <span class="s1">'semeval'</span> <span class="k">if</span> <span class="n">for_model_selection</span> <span class="k">else</span> <span class="n">dataset_name</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "</span>
|
||
<span class="sa">f</span><span class="s2">"(called 'semeval'); returning trainin-set='</span><span class="si">{</span><span class="n">trainset_name</span><span class="si">}</span><span class="s2">' and test-set=</span><span class="si">{</span><span class="n">testset_name</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
|
||
<span class="k">else</span><span class="p">:</span>
|
||
<span class="k">if</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'semeval'</span> <span class="ow">and</span> <span class="n">for_model_selection</span><span class="o">==</span><span class="kc">False</span><span class="p">:</span>
|
||
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s1">'dataset "semeval" can only be used for model selection. '</span>
|
||
<span class="s1">'Use "semeval13", "semeval14", or "semeval15" for model evaluation.'</span><span class="p">)</span>
|
||
<span class="n">trainset_name</span> <span class="o">=</span> <span class="n">testset_name</span> <span class="o">=</span> <span class="n">dataset_name</span>
|
||
|
||
<span class="k">if</span> <span class="n">for_model_selection</span><span class="p">:</span>
|
||
<span class="n">train</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">unzipped_path</span><span class="p">,</span> <span class="s1">'train'</span><span class="p">,</span> <span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">trainset_name</span><span class="si">}</span><span class="s1">.train.feature.txt'</span><span class="p">)</span>
|
||
<span class="n">test</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">unzipped_path</span><span class="p">,</span> <span class="s1">'test'</span><span class="p">,</span> <span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">testset_name</span><span class="si">}</span><span class="s1">.dev.feature.txt'</span><span class="p">)</span>
|
||
<span class="k">else</span><span class="p">:</span>
|
||
<span class="n">train</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">unzipped_path</span><span class="p">,</span> <span class="s1">'train'</span><span class="p">,</span> <span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">trainset_name</span><span class="si">}</span><span class="s1">.train+dev.feature.txt'</span><span class="p">)</span>
|
||
<span class="k">if</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'semeval16'</span><span class="p">:</span> <span class="c1"># there is a different test name in the case of semeval16 only</span>
|
||
<span class="n">test</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">unzipped_path</span><span class="p">,</span> <span class="s1">'test'</span><span class="p">,</span> <span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">testset_name</span><span class="si">}</span><span class="s1">.dev-test.feature.txt'</span><span class="p">)</span>
|
||
<span class="k">else</span><span class="p">:</span>
|
||
<span class="n">test</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">unzipped_path</span><span class="p">,</span> <span class="s1">'test'</span><span class="p">,</span> <span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">testset_name</span><span class="si">}</span><span class="s1">.test.feature.txt'</span><span class="p">)</span>
|
||
|
||
<span class="n">pickle_path</span> <span class="o">=</span> <span class="kc">None</span>
|
||
<span class="k">if</span> <span class="n">pickle</span><span class="p">:</span>
|
||
<span class="n">mode</span> <span class="o">=</span> <span class="s2">"train-dev"</span> <span class="k">if</span> <span class="n">for_model_selection</span> <span class="k">else</span> <span class="s2">"train+dev-test"</span>
|
||
<span class="n">pickle_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">unzipped_path</span><span class="p">,</span> <span class="s1">'pickle'</span><span class="p">,</span> <span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">testset_name</span><span class="si">}</span><span class="s1">.</span><span class="si">{</span><span class="n">mode</span><span class="si">}</span><span class="s1">.pkl'</span><span class="p">)</span>
|
||
<span class="n">data</span> <span class="o">=</span> <span class="n">pickled_resource</span><span class="p">(</span><span class="n">pickle_path</span><span class="p">,</span> <span class="n">Dataset</span><span class="o">.</span><span class="n">load</span><span class="p">,</span> <span class="n">train</span><span class="p">,</span> <span class="n">test</span><span class="p">,</span> <span class="n">from_sparse</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">min_df</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
|
||
<span class="n">reduce_columns</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="n">min_df</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
|
||
<span class="n">data</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">dataset_name</span>
|
||
|
||
<span class="k">return</span> <span class="n">data</span></div>
|
||
|
||
|
||
|
||
<div class="viewcode-block" id="fetch_UCIBinaryDataset">
|
||
<a class="viewcode-back" href="../../../quapy.data.html#quapy.data.datasets.fetch_UCIBinaryDataset">[docs]</a>
|
||
<span class="k">def</span> <span class="nf">fetch_UCIBinaryDataset</span><span class="p">(</span><span class="n">dataset_name</span><span class="p">,</span> <span class="n">data_home</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">test_split</span><span class="o">=</span><span class="mf">0.3</span><span class="p">,</span> <span class="n">verbose</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dataset</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in</span>
|
||
<span class="sd"> `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).</span>
|
||
<span class="sd"> Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.</span>
|
||
<span class="sd"> Information Fusion, 34, 87-100. &lt;https://www.sciencedirect.com/science/article/pii/S1566253516300628&gt;`_</span>
|
||
<span class="sd"> and</span>
|
||
<span class="sd"> `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).</span>
|
||
<span class="sd"> Dynamic ensemble selection for quantification tasks.</span>
|
||
<span class="sd"> Information Fusion, 45, 1-15. &lt;https://www.sciencedirect.com/science/article/pii/S1566253517303652&gt;`_.</span>
|
||
<span class="sd"> The datasets do not come with a predefined train-test split (see :meth:`fetch_UCIBinaryLabelledCollection` for further</span>
|
||
<span class="sd"> information on how to use these collections), and so a train-test split is generated at desired proportion.</span>
|
||
<span class="sd"> The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_BINARY_DATASETS`</span>
|
||
|
||
<span class="sd"> :param dataset_name: a dataset name</span>
|
||
<span class="sd"> :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default</span>
|
||
<span class="sd"> ~/quapy_data/ directory)</span>
|
||
<span class="sd"> :param test_split: proportion of documents to be included in the test set. The rest forms the training set</span>
|
||
<span class="sd"> :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets</span>
|
||
<span class="sd"> :return: a :class:`quapy.data.base.Dataset` instance</span>
|
||
<span class="sd"> """</span>
|
||
<span class="n">data</span> <span class="o">=</span> <span class="n">fetch_UCIBinaryLabelledCollection</span><span class="p">(</span><span class="n">dataset_name</span><span class="p">,</span> <span class="n">data_home</span><span class="p">,</span> <span class="n">verbose</span><span class="p">)</span>
|
||
<span class="k">return</span> <span class="n">Dataset</span><span class="p">(</span><span class="o">*</span><span class="n">data</span><span class="o">.</span><span class="n">split_stratified</span><span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">test_split</span><span class="p">,</span> <span class="n">random_state</span><span class="o">=</span><span class="mi">0</span><span class="p">))</span></div>
|
||
|
||
|
||
|
||
<div class="viewcode-block" id="fetch_UCIBinaryLabelledCollection">
|
||
<a class="viewcode-back" href="../../../quapy.data.html#quapy.data.datasets.fetch_UCIBinaryLabelledCollection">[docs]</a>
|
||
<span class="k">def</span> <span class="nf">fetch_UCIBinaryLabelledCollection</span><span class="p">(</span><span class="n">dataset_name</span><span class="p">,</span> <span class="n">data_home</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">verbose</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">LabelledCollection</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in</span>
|
||
<span class="sd"> `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).</span>
|
||
<span class="sd"> Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.</span>
|
||
<span class="sd"> Information Fusion, 34, 87-100. &lt;https://www.sciencedirect.com/science/article/pii/S1566253516300628&gt;`_</span>
|
||
<span class="sd"> and</span>
|
||
<span class="sd"> `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).</span>
|
||
<span class="sd"> Dynamic ensemble selection for quantification tasks.</span>
|
||
<span class="sd"> Information Fusion, 45, 1-15. &lt;https://www.sciencedirect.com/science/article/pii/S1566253517303652&gt;`_.</span>
|
||
<span class="sd"> The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation</span>
|
||
<span class="sd"> protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.</span>
|
||
<span class="sd"> This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:</span>
|
||
|
||
<span class="sd"> >>> import quapy as qp</span>
|
||
<span class="sd"> >>> collection = qp.datasets.fetch_UCIBinaryLabelledCollection("yeast")</span>
|
||
<span class="sd"> >>> for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):</span>
|
||
<span class="sd"> >>> ...</span>
|
||
|
||
<span class="sd"> The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_BINARY_DATASETS`</span>
|
||
|
||
<span class="sd"> :param dataset_name: a dataset name</span>
|
||
<span class="sd"> :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default</span>
|
||
<span class="sd"> ~/quapy_data/ directory)</span>
|
||
<span class="sd"> :param test_split: proportion of documents to be included in the test set. The rest conforms the training set</span>
|
||
<span class="sd"> :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets</span>
|
||
<span class="sd"> :return: a :class:`quapy.data.base.LabelledCollection` instance</span>
|
||
<span class="sd"> """</span>
|
||
|
||
<span class="k">assert</span> <span class="n">dataset_name</span> <span class="ow">in</span> <span class="n">UCI_BINARY_DATASETS</span><span class="p">,</span> \
|
||
<span class="sa">f</span><span class="s1">'Name </span><span class="si">{</span><span class="n">dataset_name</span><span class="si">}</span><span class="s1"> does not match any known dataset from the UCI Machine Learning datasets repository. '</span> \
|
||
<span class="sa">f</span><span class="s1">'Valid ones are </span><span class="si">{</span><span class="n">UCI_BINARY_DATASETS</span><span class="si">}</span><span class="s1">'</span>
|
||
<span class="k">if</span> <span class="n">data_home</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
||
<span class="n">data_home</span> <span class="o">=</span> <span class="n">get_quapy_home</span><span class="p">()</span>
|
||
|
||
<span class="n">dataset_fullname</span> <span class="o">=</span> <span class="p">{</span>
|
||
<span class="s1">'acute.a'</span><span class="p">:</span> <span class="s1">'Acute Inflammations (urinary bladder)'</span><span class="p">,</span>
|
||
<span class="s1">'acute.b'</span><span class="p">:</span> <span class="s1">'Acute Inflammations (renal pelvis)'</span><span class="p">,</span>
|
||
<span class="s1">'balance.1'</span><span class="p">:</span> <span class="s1">'Balance Scale Weight & Distance Database (left)'</span><span class="p">,</span>
|
||
<span class="s1">'balance.2'</span><span class="p">:</span> <span class="s1">'Balance Scale Weight & Distance Database (balanced)'</span><span class="p">,</span>
|
||
<span class="s1">'balance.3'</span><span class="p">:</span> <span class="s1">'Balance Scale Weight & Distance Database (right)'</span><span class="p">,</span>
|
||
<span class="s1">'breast-cancer'</span><span class="p">:</span> <span class="s1">'Breast Cancer Wisconsin (Original)'</span><span class="p">,</span>
|
||
<span class="s1">'cmc.1'</span><span class="p">:</span> <span class="s1">'Contraceptive Method Choice (no use)'</span><span class="p">,</span>
|
||
<span class="s1">'cmc.2'</span><span class="p">:</span> <span class="s1">'Contraceptive Method Choice (long term)'</span><span class="p">,</span>
|
||
<span class="s1">'cmc.3'</span><span class="p">:</span> <span class="s1">'Contraceptive Method Choice (short term)'</span><span class="p">,</span>
|
||
<span class="s1">'ctg.1'</span><span class="p">:</span> <span class="s1">'Cardiotocography Data Set (normal)'</span><span class="p">,</span>
|
||
<span class="s1">'ctg.2'</span><span class="p">:</span> <span class="s1">'Cardiotocography Data Set (suspect)'</span><span class="p">,</span>
|
||
<span class="s1">'ctg.3'</span><span class="p">:</span> <span class="s1">'Cardiotocography Data Set (pathologic)'</span><span class="p">,</span>
|
||
<span class="s1">'german'</span><span class="p">:</span> <span class="s1">'Statlog German Credit Data'</span><span class="p">,</span>
|
||
<span class="s1">'haberman'</span><span class="p">:</span> <span class="s2">"Haberman's Survival Data"</span><span class="p">,</span>
|
||
<span class="s1">'ionosphere'</span><span class="p">:</span> <span class="s1">'Johns Hopkins University Ionosphere DB'</span><span class="p">,</span>
|
||
<span class="s1">'iris.1'</span><span class="p">:</span> <span class="s1">'Iris Plants Database(x)'</span><span class="p">,</span>
|
||
<span class="s1">'iris.2'</span><span class="p">:</span> <span class="s1">'Iris Plants Database(versicolour)'</span><span class="p">,</span>
|
||
<span class="s1">'iris.3'</span><span class="p">:</span> <span class="s1">'Iris Plants Database(virginica)'</span><span class="p">,</span>
|
||
<span class="s1">'mammographic'</span><span class="p">:</span> <span class="s1">'Mammographic Mass'</span><span class="p">,</span>
|
||
<span class="s1">'pageblocks.5'</span><span class="p">:</span> <span class="s1">'Page Blocks Classification (5)'</span><span class="p">,</span>
|
||
<span class="s1">'semeion'</span><span class="p">:</span> <span class="s1">'Semeion Handwritten Digit (8)'</span><span class="p">,</span>
|
||
<span class="s1">'sonar'</span><span class="p">:</span> <span class="s1">'Sonar, Mines vs. Rocks'</span><span class="p">,</span>
|
||
<span class="s1">'spambase'</span><span class="p">:</span> <span class="s1">'Spambase Data Set'</span><span class="p">,</span>
|
||
<span class="s1">'spectf'</span><span class="p">:</span> <span class="s1">'SPECTF Heart Data'</span><span class="p">,</span>
|
||
<span class="s1">'tictactoe'</span><span class="p">:</span> <span class="s1">'Tic-Tac-Toe Endgame Database'</span><span class="p">,</span>
|
||
<span class="s1">'transfusion'</span><span class="p">:</span> <span class="s1">'Blood Transfusion Service Center Data Set'</span><span class="p">,</span>
|
||
<span class="s1">'wdbc'</span><span class="p">:</span> <span class="s1">'Wisconsin Diagnostic Breast Cancer'</span><span class="p">,</span>
|
||
<span class="s1">'wine.1'</span><span class="p">:</span> <span class="s1">'Wine Recognition Data (1)'</span><span class="p">,</span>
|
||
<span class="s1">'wine.2'</span><span class="p">:</span> <span class="s1">'Wine Recognition Data (2)'</span><span class="p">,</span>
|
||
<span class="s1">'wine.3'</span><span class="p">:</span> <span class="s1">'Wine Recognition Data (3)'</span><span class="p">,</span>
|
||
<span class="s1">'wine-q-red'</span><span class="p">:</span> <span class="s1">'Wine Quality Red (6-10)'</span><span class="p">,</span>
|
||
<span class="s1">'wine-q-white'</span><span class="p">:</span> <span class="s1">'Wine Quality White (6-10)'</span><span class="p">,</span>
|
||
<span class="s1">'yeast'</span><span class="p">:</span> <span class="s1">'Yeast'</span><span class="p">,</span>
|
||
<span class="p">}</span>
|
||
|
||
<span class="c1"># the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use</span>
|
||
<span class="c1"># to download the raw dataset</span>
|
||
<span class="n">identifier_map</span> <span class="o">=</span> <span class="p">{</span>
|
||
<span class="s1">'acute.a'</span><span class="p">:</span> <span class="s1">'acute'</span><span class="p">,</span>
|
||
<span class="s1">'acute.b'</span><span class="p">:</span> <span class="s1">'acute'</span><span class="p">,</span>
|
||
<span class="s1">'balance.1'</span><span class="p">:</span> <span class="s1">'balance-scale'</span><span class="p">,</span>
|
||
<span class="s1">'balance.2'</span><span class="p">:</span> <span class="s1">'balance-scale'</span><span class="p">,</span>
|
||
<span class="s1">'balance.3'</span><span class="p">:</span> <span class="s1">'balance-scale'</span><span class="p">,</span>
|
||
<span class="s1">'breast-cancer'</span><span class="p">:</span> <span class="s1">'breast-cancer-wisconsin'</span><span class="p">,</span>
|
||
<span class="s1">'cmc.1'</span><span class="p">:</span> <span class="s1">'cmc'</span><span class="p">,</span>
|
||
<span class="s1">'cmc.2'</span><span class="p">:</span> <span class="s1">'cmc'</span><span class="p">,</span>
|
||
<span class="s1">'cmc.3'</span><span class="p">:</span> <span class="s1">'cmc'</span><span class="p">,</span>
|
||
<span class="s1">'ctg.1'</span><span class="p">:</span> <span class="s1">'00193'</span><span class="p">,</span>
|
||
<span class="s1">'ctg.2'</span><span class="p">:</span> <span class="s1">'00193'</span><span class="p">,</span>
|
||
<span class="s1">'ctg.3'</span><span class="p">:</span> <span class="s1">'00193'</span><span class="p">,</span>
|
||
<span class="s1">'german'</span><span class="p">:</span> <span class="s1">'statlog/german'</span><span class="p">,</span>
|
||
<span class="s1">'haberman'</span><span class="p">:</span> <span class="s1">'haberman'</span><span class="p">,</span>
|
||
<span class="s1">'ionosphere'</span><span class="p">:</span> <span class="s1">'ionosphere'</span><span class="p">,</span>
|
||
<span class="s1">'iris.1'</span><span class="p">:</span> <span class="s1">'iris'</span><span class="p">,</span>
|
||
<span class="s1">'iris.2'</span><span class="p">:</span> <span class="s1">'iris'</span><span class="p">,</span>
|
||
<span class="s1">'iris.3'</span><span class="p">:</span> <span class="s1">'iris'</span><span class="p">,</span>
|
||
<span class="s1">'mammographic'</span><span class="p">:</span> <span class="s1">'mammographic-masses'</span><span class="p">,</span>
|
||
<span class="s1">'pageblocks.5'</span><span class="p">:</span> <span class="s1">'page-blocks'</span><span class="p">,</span>
|
||
<span class="s1">'semeion'</span><span class="p">:</span> <span class="s1">'semeion'</span><span class="p">,</span>
|
||
<span class="s1">'sonar'</span><span class="p">:</span> <span class="s1">'undocumented/connectionist-bench/sonar'</span><span class="p">,</span>
|
||
<span class="s1">'spambase'</span><span class="p">:</span> <span class="s1">'spambase'</span><span class="p">,</span>
|
||
<span class="s1">'spectf'</span><span class="p">:</span> <span class="s1">'spect'</span><span class="p">,</span>
|
||
<span class="s1">'tictactoe'</span><span class="p">:</span> <span class="s1">'tic-tac-toe'</span><span class="p">,</span>
|
||
<span class="s1">'transfusion'</span><span class="p">:</span> <span class="s1">'blood-transfusion'</span><span class="p">,</span>
|
||
<span class="s1">'wdbc'</span><span class="p">:</span> <span class="s1">'breast-cancer-wisconsin'</span><span class="p">,</span>
|
||
<span class="s1">'wine-q-red'</span><span class="p">:</span> <span class="s1">'wine-quality'</span><span class="p">,</span>
|
||
<span class="s1">'wine-q-white'</span><span class="p">:</span> <span class="s1">'wine-quality'</span><span class="p">,</span>
|
||
<span class="s1">'wine.1'</span><span class="p">:</span> <span class="s1">'wine'</span><span class="p">,</span>
|
||
<span class="s1">'wine.2'</span><span class="p">:</span> <span class="s1">'wine'</span><span class="p">,</span>
|
||
<span class="s1">'wine.3'</span><span class="p">:</span> <span class="s1">'wine'</span><span class="p">,</span>
|
||
<span class="s1">'yeast'</span><span class="p">:</span> <span class="s1">'yeast'</span><span class="p">,</span>
|
||
<span class="p">}</span>
|
||
|
||
<span class="c1"># the filename is the name of the file within the data_folder indexed by the identifier</span>
|
||
<span class="n">file_name</span> <span class="o">=</span> <span class="p">{</span>
|
||
<span class="s1">'acute'</span><span class="p">:</span> <span class="s1">'diagnosis.data'</span><span class="p">,</span>
|
||
<span class="s1">'00193'</span><span class="p">:</span> <span class="s1">'CTG.xls'</span><span class="p">,</span>
|
||
<span class="s1">'statlog/german'</span><span class="p">:</span> <span class="s1">'german.data-numeric'</span><span class="p">,</span>
|
||
<span class="s1">'mammographic-masses'</span><span class="p">:</span> <span class="s1">'mammographic_masses.data'</span><span class="p">,</span>
|
||
<span class="s1">'page-blocks'</span><span class="p">:</span> <span class="s1">'page-blocks.data.Z'</span><span class="p">,</span>
|
||
<span class="s1">'undocumented/connectionist-bench/sonar'</span><span class="p">:</span> <span class="s1">'sonar.all-data'</span><span class="p">,</span>
|
||
<span class="s1">'spect'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'SPECTF.train'</span><span class="p">,</span> <span class="s1">'SPECTF.test'</span><span class="p">],</span>
|
||
<span class="s1">'blood-transfusion'</span><span class="p">:</span> <span class="s1">'transfusion.data'</span><span class="p">,</span>
|
||
<span class="s1">'wine-quality'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'winequality-red.csv'</span><span class="p">,</span> <span class="s1">'winequality-white.csv'</span><span class="p">],</span>
|
||
<span class="s1">'breast-cancer-wisconsin'</span><span class="p">:</span> <span class="s1">'breast-cancer-wisconsin.data'</span> <span class="k">if</span> <span class="n">dataset_name</span><span class="o">==</span><span class="s1">'breast-cancer'</span> <span class="k">else</span> <span class="s1">'wdbc.data'</span>
|
||
<span class="p">}</span>
|
||
|
||
<span class="c1"># the filename containing the dataset description (if any)</span>
|
||
<span class="n">desc_name</span> <span class="o">=</span> <span class="p">{</span>
|
||
<span class="s1">'acute'</span><span class="p">:</span> <span class="s1">'diagnosis.names'</span><span class="p">,</span>
|
||
<span class="s1">'00193'</span><span class="p">:</span> <span class="kc">None</span><span class="p">,</span>
|
||
<span class="s1">'statlog/german'</span><span class="p">:</span> <span class="s1">'german.doc'</span><span class="p">,</span>
|
||
<span class="s1">'mammographic-masses'</span><span class="p">:</span> <span class="s1">'mammographic_masses.names'</span><span class="p">,</span>
|
||
<span class="s1">'undocumented/connectionist-bench/sonar'</span><span class="p">:</span> <span class="s1">'sonar.names'</span><span class="p">,</span>
|
||
<span class="s1">'spect'</span><span class="p">:</span> <span class="s1">'SPECTF.names'</span><span class="p">,</span>
|
||
<span class="s1">'blood-transfusion'</span><span class="p">:</span> <span class="s1">'transfusion.names'</span><span class="p">,</span>
|
||
<span class="s1">'wine-quality'</span><span class="p">:</span> <span class="s1">'winequality.names'</span><span class="p">,</span>
|
||
<span class="s1">'breast-cancer-wisconsin'</span><span class="p">:</span> <span class="s1">'breast-cancer-wisconsin.names'</span> <span class="k">if</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'breast-cancer'</span> <span class="k">else</span> <span class="s1">'wdbc.names'</span>
|
||
<span class="p">}</span>
|
||
|
||
<span class="n">identifier</span> <span class="o">=</span> <span class="n">identifier_map</span><span class="p">[</span><span class="n">dataset_name</span><span class="p">]</span>
|
||
<span class="n">filename</span> <span class="o">=</span> <span class="n">file_name</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">identifier</span><span class="p">,</span> <span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">identifier</span><span class="si">}</span><span class="s1">.data'</span><span class="p">)</span>
|
||
<span class="n">descfile</span> <span class="o">=</span> <span class="n">desc_name</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">identifier</span><span class="p">,</span> <span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">identifier</span><span class="si">}</span><span class="s1">.names'</span><span class="p">)</span>
|
||
<span class="n">fullname</span> <span class="o">=</span> <span class="n">dataset_fullname</span><span class="p">[</span><span class="n">dataset_name</span><span class="p">]</span>
|
||
|
||
<span class="n">URL</span> <span class="o">=</span> <span class="sa">f</span><span class="s1">'http://archive.ics.uci.edu/ml/machine-learning-databases/</span><span class="si">{</span><span class="n">identifier</span><span class="si">}</span><span class="s1">'</span>
|
||
<span class="n">data_dir</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">data_home</span><span class="p">,</span> <span class="s1">'uci_datasets'</span><span class="p">,</span> <span class="n">identifier</span><span class="p">)</span>
|
||
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> <span class="c1"># filename could be a list of files, in which case it will be processed later</span>
|
||
<span class="n">data_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">data_dir</span><span class="p">,</span> <span class="n">filename</span><span class="p">)</span>
|
||
<span class="n">download_file_if_not_exists</span><span class="p">(</span><span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">URL</span><span class="si">}</span><span class="s1">/</span><span class="si">{</span><span class="n">filename</span><span class="si">}</span><span class="s1">'</span><span class="p">,</span> <span class="n">data_path</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">descfile</span><span class="p">:</span>
|
||
<span class="k">try</span><span class="p">:</span>
|
||
<span class="n">download_file_if_not_exists</span><span class="p">(</span><span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">URL</span><span class="si">}</span><span class="s1">/</span><span class="si">{</span><span class="n">descfile</span><span class="si">}</span><span class="s1">'</span><span class="p">,</span> <span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">data_dir</span><span class="si">}</span><span class="s1">/</span><span class="si">{</span><span class="n">descfile</span><span class="si">}</span><span class="s1">'</span><span class="p">)</span>
|
||
<span class="k">if</span> <span class="n">verbose</span><span class="p">:</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="nb">open</span><span class="p">(</span><span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">data_dir</span><span class="si">}</span><span class="s1">/</span><span class="si">{</span><span class="n">descfile</span><span class="si">}</span><span class="s1">'</span><span class="p">,</span> <span class="s1">'rt'</span><span class="p">)</span><span class="o">.</span><span class="n">read</span><span class="p">())</span>
|
||
<span class="k">except</span> <span class="ne">Exception</span><span class="p">:</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="s1">'could not read the description file'</span><span class="p">)</span>
|
||
<span class="k">elif</span> <span class="n">verbose</span><span class="p">:</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="s1">'no file description available'</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">verbose</span><span class="p">:</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s1">'Loading </span><span class="si">{</span><span class="n">dataset_name</span><span class="si">}</span><span class="s1"> (</span><span class="si">{</span><span class="n">fullname</span><span class="si">}</span><span class="s1">)'</span><span class="p">)</span>
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'acute'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s1">'utf-16'</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">'</span><span class="se">\t</span><span class="s1">'</span><span class="p">)</span>
|
||
|
||
<span class="n">df</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="nb">float</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">','</span><span class="p">,</span> <span class="s1">'.'</span><span class="p">)))</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">,</span> <span class="n">copy</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
|
||
<span class="p">[</span><span class="n">_df_replace</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">6</span><span class="p">)]</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">loc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">5</span><span class="p">]</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="k">if</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'acute.a'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="mi">6</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'yes'</span><span class="p">)</span>
|
||
<span class="k">elif</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'acute.b'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="mi">7</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'yes'</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'balance-scale'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">','</span><span class="p">)</span>
|
||
<span class="k">if</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'balance.1'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'L'</span><span class="p">)</span>
|
||
<span class="k">elif</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'balance.2'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'B'</span><span class="p">)</span>
|
||
<span class="k">elif</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'balance.3'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'R'</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">loc</span><span class="p">[:,</span> <span class="mi">1</span><span class="p">:]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'breast-cancer-wisconsin'</span> <span class="ow">and</span> <span class="n">dataset_name</span><span class="o">==</span><span class="s1">'breast-cancer'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">','</span><span class="p">)</span>
|
||
<span class="n">Xy</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">loc</span><span class="p">[:,</span> <span class="mi">1</span><span class="p">:</span><span class="mi">10</span><span class="p">]</span>
|
||
<span class="n">Xy</span><span class="p">[</span><span class="n">Xy</span><span class="o">==</span><span class="s1">'?'</span><span class="p">]</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span>
|
||
<span class="n">Xy</span> <span class="o">=</span> <span class="n">Xy</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">Xy</span><span class="o">.</span><span class="n">loc</span><span class="p">[:,</span> <span class="mi">1</span><span class="p">:</span><span class="mi">9</span><span class="p">]</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">X</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">Xy</span><span class="p">[</span><span class="mi">10</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'breast-cancer-wisconsin'</span> <span class="ow">and</span> <span class="n">dataset_name</span><span class="o">==</span><span class="s1">'wdbc'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">','</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">loc</span><span class="p">[:,</span> <span class="mi">2</span><span class="p">:</span><span class="mi">32</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'M'</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'cmc'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">','</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">loc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">8</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">9</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">int</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="k">if</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'cmc.1'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
|
||
<span class="k">elif</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'cmc.2'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
|
||
<span class="k">elif</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'cmc.3'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'00193'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">'Data'</span><span class="p">,</span> <span class="n">skipfooter</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="nb">list</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="mi">24</span><span class="p">))]</span> <span class="c1"># select columns numbered (number 23 is the target label)</span>
|
||
<span class="c1"># replaces the header with the first row</span>
|
||
<span class="n">new_header</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># grab the first row for the header</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">1</span><span class="p">:]</span> <span class="c1"># take the data less the header row</span>
|
||
<span class="n">df</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">new_header</span> <span class="c1"># set the header row as the df header</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">22</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="s1">'NSP'</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">int</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="k">if</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'ctg.1'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span> <span class="c1"># 1==Normal</span>
|
||
<span class="k">elif</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'ctg.2'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span> <span class="c1"># 2==Suspect</span>
|
||
<span class="k">elif</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'ctg.3'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span> <span class="c1"># 3==Pathologic</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'statlog/german'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">24</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">24</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">int</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'haberman'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">3</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">3</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">int</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'ionosphere'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">34</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">34</span><span class="p">]</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'b'</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'iris'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">4</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">4</span><span class="p">]</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="k">if</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'iris.1'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'Iris-setosa'</span><span class="p">)</span> <span class="c1"># 1==Setosa</span>
|
||
<span class="k">elif</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'iris.2'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'Iris-versicolor'</span><span class="p">)</span> <span class="c1"># 2==Versicolor</span>
|
||
<span class="k">elif</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'iris.3'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'Iris-virginica'</span><span class="p">)</span> <span class="c1"># 3==Virginica</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'mammographic-masses'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">','</span><span class="p">)</span>
|
||
<span class="n">df</span><span class="p">[</span><span class="n">df</span> <span class="o">==</span> <span class="s1">'?'</span><span class="p">]</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">nan</span>
|
||
<span class="n">Xy</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">Xy</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">5</span><span class="p">]</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">X</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">Xy</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span><span class="mi">5</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'page-blocks'</span><span class="p">:</span>
|
||
<span class="n">data_path_</span> <span class="o">=</span> <span class="n">data_path</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">'.Z'</span><span class="p">,</span> <span class="s1">''</span><span class="p">)</span>
|
||
<span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">data_path_</span><span class="p">):</span>
|
||
<span class="k">raise</span> <span class="ne">FileNotFoundError</span><span class="p">(</span><span class="sa">f</span><span class="s1">'Warning: file </span><span class="si">{</span><span class="n">data_path_</span><span class="si">}</span><span class="s1"> does not exist. If this is the first time you '</span>
|
||
<span class="sa">f</span><span class="s1">'attempt to load this dataset, then you have to manually unzip the </span><span class="si">{</span><span class="n">data_path</span><span class="si">}</span><span class="s1"> '</span>
|
||
<span class="sa">f</span><span class="s1">'and name the extracted file </span><span class="si">{</span><span class="n">data_path_</span><span class="si">}</span><span class="s1"> (unfortunately, neither zipfile, nor '</span>
|
||
<span class="sa">f</span><span class="s1">'gzip can handle unix compressed files automatically -- there is a repo in GitHub '</span>
|
||
<span class="sa">f</span><span class="s1">'https://github.com/umeat/unlzw where the problem seems to be solved anyway).'</span><span class="p">)</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path_</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">10</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">10</span><span class="p">]</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span> <span class="c1"># 5==block "graphic"</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'semeion'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span> <span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">256</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">263</span><span class="p">]</span><span class="o">.</span><span class="n">values</span> <span class="c1"># 263 stands for digit 8 (labels are one-hot vectors from col 256-266)</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'undocumented/connectionist-bench/sonar'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">','</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">60</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">60</span><span class="p">]</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'R'</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'spambase'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">','</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">57</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">57</span><span class="p">]</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'spect'</span><span class="p">:</span>
|
||
<span class="n">dfs</span> <span class="o">=</span> <span class="p">[]</span>
|
||
<span class="k">for</span> <span class="n">file</span> <span class="ow">in</span> <span class="n">filename</span><span class="p">:</span>
|
||
<span class="n">data_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">data_dir</span><span class="p">,</span> <span class="n">file</span><span class="p">)</span>
|
||
<span class="n">download_file_if_not_exists</span><span class="p">(</span><span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">URL</span><span class="si">}</span><span class="s1">/</span><span class="si">{</span><span class="n">file</span><span class="si">}</span><span class="s1">'</span><span class="p">,</span> <span class="n">data_path</span><span class="p">)</span>
|
||
<span class="n">dfs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">','</span><span class="p">))</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">(</span><span class="n">dfs</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">1</span><span class="p">:</span><span class="mi">45</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'tic-tac-toe'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">','</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">9</span><span class="p">]</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">'o'</span><span class="p">,</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">'b'</span><span class="p">,</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">'x'</span><span class="p">,</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">9</span><span class="p">]</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'negative'</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'blood-transfusion'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">','</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">4</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">4</span><span class="p">]</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'wine'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">','</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">1</span><span class="p">:</span><span class="mi">14</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="k">if</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'wine.1'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
|
||
<span class="k">elif</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'wine.2'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
|
||
<span class="k">elif</span> <span class="n">dataset_name</span> <span class="o">==</span> <span class="s1">'wine.3'</span><span class="p">:</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'wine-quality'</span><span class="p">:</span>
|
||
<span class="n">filename</span> <span class="o">=</span> <span class="n">filename</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">if</span> <span class="n">dataset_name</span><span class="o">==</span><span class="s1">'wine-q-red'</span> <span class="k">else</span> <span class="n">filename</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
|
||
<span class="n">data_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">data_dir</span><span class="p">,</span> <span class="n">filename</span><span class="p">)</span>
|
||
<span class="n">download_file_if_not_exists</span><span class="p">(</span><span class="sa">f</span><span class="s1">'</span><span class="si">{</span><span class="n">URL</span><span class="si">}</span><span class="s1">/</span><span class="si">{</span><span class="n">filename</span><span class="si">}</span><span class="s1">'</span><span class="p">,</span> <span class="n">data_path</span><span class="p">)</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">sep</span><span class="o">=</span><span class="s1">';'</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">0</span><span class="p">:</span><span class="mi">11</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">11</span><span class="p">]</span><span class="o">.</span><span class="n">values</span> <span class="o">></span> <span class="mi">5</span>
|
||
|
||
<span class="k">if</span> <span class="n">identifier</span> <span class="o">==</span> <span class="s1">'yeast'</span><span class="p">:</span>
|
||
<span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">data_path</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">delim_whitespace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
<span class="n">X</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">1</span><span class="p">:</span><span class="mi">9</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:,</span> <span class="mi">9</span><span class="p">]</span><span class="o">.</span><span class="n">values</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">binarize</span><span class="p">(</span><span class="n">y</span><span class="p">,</span> <span class="n">pos_class</span><span class="o">=</span><span class="s1">'NUC'</span><span class="p">)</span>
|
||
|
||
<span class="n">data</span> <span class="o">=</span> <span class="n">LabelledCollection</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">y</span><span class="p">)</span>
|
||
<span class="k">if</span> <span class="n">verbose</span><span class="p">:</span>
|
||
<span class="n">data</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
|
||
<span class="k">return</span> <span class="n">data</span></div>
|
||
|
||
|
||
|
||
<div class="viewcode-block" id="fetch_UCIMulticlassDataset">
|
||
<a class="viewcode-back" href="../../../quapy.data.html#quapy.data.datasets.fetch_UCIMulticlassDataset">[docs]</a>
|
||
<span class="k">def</span> <span class="nf">fetch_UCIMulticlassDataset</span><span class="p">(</span><span class="n">dataset_name</span><span class="p">,</span> <span class="n">data_home</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">test_split</span><span class="o">=</span><span class="mf">0.3</span><span class="p">,</span> <span class="n">verbose</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dataset</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. </span>
|
||
|
||
<span class="sd"> The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:</span>
|
||
<span class="sd"> - It has more than 1000 instances</span>
|
||
<span class="sd"> - It is suited for classification</span>
|
||
<span class="sd"> - It has more than two classes</span>
|
||
<span class="sd"> - It is available for Python import (requires ucimlrepo package)</span>
|
||
|
||
<span class="sd"> >>> import quapy as qp</span>
|
||
<span class="sd"> >>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")</span>
|
||
<span class="sd"> >>> train, test = dataset.train_test</span>
|
||
<span class="sd"> >>> ...</span>
|
||
|
||
<span class="sd"> The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`</span>
|
||
|
||
<span class="sd"> The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.</span>
|
||
|
||
<span class="sd"> :param dataset_name: a dataset name</span>
|
||
<span class="sd"> :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default</span>
|
||
<span class="sd"> ~/quapy_data/ directory)</span>
|
||
<span class="sd"> :param test_split: proportion of documents to be included in the test set. The rest forms the training set</span>
|
||
<span class="sd"> :param verbose: set to True (default is False) to get information (stats) about the dataset</span>
|
||
<span class="sd"> :return: a :class:`quapy.data.base.Dataset` instance</span>
|
||
<span class="sd"> """</span>
|
||
<span class="n">data</span> <span class="o">=</span> <span class="n">fetch_UCIMulticlassLabelledCollection</span><span class="p">(</span><span class="n">dataset_name</span><span class="p">,</span> <span class="n">data_home</span><span class="p">,</span> <span class="n">verbose</span><span class="p">)</span>
|
||
<span class="k">return</span> <span class="n">Dataset</span><span class="p">(</span><span class="o">*</span><span class="n">data</span><span class="o">.</span><span class="n">split_stratified</span><span class="p">(</span><span class="mi">1</span> <span class="o">-</span> <span class="n">test_split</span><span class="p">,</span> <span class="n">random_state</span><span class="o">=</span><span class="mi">0</span><span class="p">))</span></div>
|
||
|
||
|
||
|
||
<div class="viewcode-block" id="fetch_UCIMulticlassLabelledCollection">
|
||
<a class="viewcode-back" href="../../../quapy.data.html#quapy.data.datasets.fetch_UCIMulticlassLabelledCollection">[docs]</a>
|
||
<span class="k">def</span> <span class="nf">fetch_UCIMulticlassLabelledCollection</span><span class="p">(</span><span class="n">dataset_name</span><span class="p">,</span> <span class="n">data_home</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">verbose</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">LabelledCollection</span><span class="p">:</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.</span>
|
||
|
||
<span class="sd"> The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:</span>
|
||
<span class="sd"> - It has more than 1000 instances</span>
|
||
<span class="sd"> - It is suited for classification</span>
|
||
<span class="sd"> - It has more than two classes</span>
|
||
<span class="sd"> - It is available for Python import (requires ucimlrepo package)</span>
|
||
<span class="sd"> </span>
|
||
<span class="sd"> >>> import quapy as qp</span>
|
||
<span class="sd"> >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")</span>
|
||
<span class="sd"> >>> X, y = collection.Xy</span>
|
||
<span class="sd"> >>> ...</span>
|
||
|
||
<span class="sd"> The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`</span>
|
||
|
||
<span class="sd"> The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.</span>
|
||
|
||
<span class="sd"> :param dataset_name: a dataset name</span>
|
||
<span class="sd"> :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default</span>
|
||
<span class="sd"> ~/quapy_data/ directory)</span>
|
||
<span class="sd"> :param test_split: proportion of documents to be included in the test set. The rest conforms the training set</span>
|
||
<span class="sd"> :param verbose: set to True (default is False) to get information (stats) about the dataset</span>
|
||
<span class="sd"> :return: a :class:`quapy.data.base.LabelledCollection` instance</span>
|
||
<span class="sd"> """</span>
|
||
<span class="k">assert</span> <span class="n">dataset_name</span> <span class="ow">in</span> <span class="n">UCI_MULTICLASS_DATASETS</span><span class="p">,</span> \
|
||
<span class="sa">f</span><span class="s1">'Name </span><span class="si">{</span><span class="n">dataset_name</span><span class="si">}</span><span class="s1"> does not match any known dataset from the '</span> \
|
||
<span class="sa">f</span><span class="s1">'UCI Machine Learning datasets repository (multiclass). '</span> \
|
||
<span class="sa">f</span><span class="s1">'Valid ones are </span><span class="si">{</span><span class="n">UCI_MULTICLASS_DATASETS</span><span class="si">}</span><span class="s1">'</span>
|
||
|
||
<span class="k">if</span> <span class="n">data_home</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
||
<span class="n">data_home</span> <span class="o">=</span> <span class="n">get_quapy_home</span><span class="p">()</span>
|
||
|
||
<span class="n">identifiers</span> <span class="o">=</span> <span class="p">{</span>
|
||
<span class="s2">"dry-bean"</span><span class="p">:</span> <span class="mi">602</span><span class="p">,</span>
|
||
<span class="s2">"wine-quality"</span><span class="p">:</span> <span class="mi">186</span><span class="p">,</span>
|
||
<span class="s2">"academic-success"</span><span class="p">:</span> <span class="mi">697</span><span class="p">,</span>
|
||
<span class="s2">"digits"</span><span class="p">:</span> <span class="mi">80</span><span class="p">,</span>
|
||
<span class="s2">"letter"</span><span class="p">:</span> <span class="mi">59</span>
|
||
<span class="p">}</span>
|
||
|
||
<span class="n">full_names</span> <span class="o">=</span> <span class="p">{</span>
|
||
<span class="s2">"dry-bean"</span><span class="p">:</span> <span class="s2">"Dry Bean Dataset"</span><span class="p">,</span>
|
||
<span class="s2">"wine-quality"</span><span class="p">:</span> <span class="s2">"Wine Quality"</span><span class="p">,</span>
|
||
<span class="s2">"academic-success"</span><span class="p">:</span> <span class="s2">"Predict students' dropout and academic success"</span><span class="p">,</span>
|
||
<span class="s2">"digits"</span><span class="p">:</span> <span class="s2">"Optical Recognition of Handwritten Digits"</span><span class="p">,</span>
|
||
<span class="s2">"letter"</span><span class="p">:</span> <span class="s2">"Letter Recognition"</span>
|
||
<span class="p">}</span>
|
||
|
||
<span class="n">identifier</span> <span class="o">=</span> <span class="n">identifiers</span><span class="p">[</span><span class="n">dataset_name</span><span class="p">]</span>
|
||
<span class="n">fullname</span> <span class="o">=</span> <span class="n">full_names</span><span class="p">[</span><span class="n">dataset_name</span><span class="p">]</span>
|
||
|
||
<span class="k">if</span> <span class="n">verbose</span><span class="p">:</span>
|
||
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s1">'Loading UCI Muticlass </span><span class="si">{</span><span class="n">dataset_name</span><span class="si">}</span><span class="s1"> (</span><span class="si">{</span><span class="n">fullname</span><span class="si">}</span><span class="s1">)'</span><span class="p">)</span>
|
||
|
||
<span class="n">file</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">data_home</span><span class="p">,</span> <span class="s1">'uci_multiclass'</span><span class="p">,</span> <span class="n">dataset_name</span><span class="o">+</span><span class="s1">'.pkl'</span><span class="p">)</span>
|
||
|
||
<span class="k">def</span> <span class="nf">download</span><span class="p">(</span><span class="nb">id</span><span class="p">):</span>
|
||
<span class="n">data</span> <span class="o">=</span> <span class="n">fetch_ucirepo</span><span class="p">(</span><span class="nb">id</span><span class="o">=</span><span class="nb">id</span><span class="p">)</span>
|
||
<span class="n">X</span><span class="p">,</span> <span class="n">y</span> <span class="o">=</span> <span class="n">data</span><span class="p">[</span><span class="s1">'data'</span><span class="p">][</span><span class="s1">'features'</span><span class="p">]</span><span class="o">.</span><span class="n">to_numpy</span><span class="p">(),</span> <span class="n">data</span><span class="p">[</span><span class="s1">'data'</span><span class="p">][</span><span class="s1">'targets'</span><span class="p">]</span><span class="o">.</span><span class="n">to_numpy</span><span class="p">()</span><span class="o">.</span><span class="n">squeeze</span><span class="p">()</span>
|
||
<span class="n">classes</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">sort</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">unique</span><span class="p">(</span><span class="n">y</span><span class="p">))</span>
|
||
<span class="n">y</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">searchsorted</span><span class="p">(</span><span class="n">classes</span><span class="p">,</span> <span class="n">y</span><span class="p">)</span>
|
||
<span class="k">return</span> <span class="n">LabelledCollection</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">y</span><span class="p">)</span>
|
||
|
||
<span class="n">data</span> <span class="o">=</span> <span class="n">pickled_resource</span><span class="p">(</span><span class="n">file</span><span class="p">,</span> <span class="n">download</span><span class="p">,</span> <span class="n">identifier</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">verbose</span><span class="p">:</span>
|
||
<span class="n">data</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
|
||
|
||
<span class="k">return</span> <span class="n">data</span></div>
|
||
|
||
|
||
|
||
<span class="k">def</span> <span class="nf">_df_replace</span><span class="p">(</span><span class="n">df</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">repl</span><span class="o">=</span><span class="p">{</span><span class="s1">'yes'</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span> <span class="s1">'no'</span><span class="p">:</span><span class="mi">0</span><span class="p">},</span> <span class="n">astype</span><span class="o">=</span><span class="nb">float</span><span class="p">):</span>
|
||
<span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">df</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="n">repl</span><span class="p">[</span><span class="n">x</span><span class="p">])</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">astype</span><span class="p">,</span> <span class="n">copy</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
|
||
|
||
|
||
<div class="viewcode-block" id="fetch_lequa2022">
|
||
<a class="viewcode-back" href="../../../quapy.data.html#quapy.data.datasets.fetch_lequa2022">[docs]</a>
|
||
<span class="k">def</span> <span class="nf">fetch_lequa2022</span><span class="p">(</span><span class="n">task</span><span class="p">,</span> <span class="n">data_home</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Loads the official datasets provided for the `LeQua <https://lequa2022.github.io/index>`_ competition.</span>
|
||
<span class="sd"> In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification</span>
|
||
<span class="sd"> problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide raw documents instead.</span>
|
||
<span class="sd"> Tasks T1A and T2A are binary sentiment quantification problems, while T1B and T2B are multiclass quantification</span>
|
||
<span class="sd"> problems consisting of estimating the class prevalence values of 28 different merchandise products.</span>
|
||
<span class="sd"> We refer to the `Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022).</span>
|
||
<span class="sd"> A Detailed Overview of LeQua@ CLEF 2022: Learning to Quantify.</span>
|
||
<span class="sd"> <https://ceur-ws.org/Vol-3180/paper-146.pdf>`_ for a detailed description</span>
|
||
<span class="sd"> on the tasks and datasets.</span>
|
||
|
||
<span class="sd"> The datasets are downloaded only once, and stored for fast reuse.</span>
|
||
|
||
<span class="sd"> See `lequa2022_experiments.py` provided in the example folder, that can serve as a guide on how to use these</span>
|
||
<span class="sd"> datasets.</span>
|
||
|
||
|
||
<span class="sd"> :param task: a string representing the task name; valid ones are T1A, T1B, T2A, and T2B</span>
|
||
<span class="sd"> :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default</span>
|
||
<span class="sd"> ~/quapy_data/ directory)</span>
|
||
<span class="sd"> :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of</span>
|
||
<span class="sd"> :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of</span>
|
||
<span class="sd"> :class:`quapy.data._lequa2022.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`,</span>
|
||
<span class="sd"> that return a series of samples stored in a directory which are labelled by prevalence.</span>
|
||
<span class="sd"> """</span>
|
||
|
||
<span class="kn">from</span> <span class="nn">quapy.data._lequa2022</span> <span class="kn">import</span> <span class="n">load_raw_documents</span><span class="p">,</span> <span class="n">load_vector_documents</span><span class="p">,</span> <span class="n">SamplesFromDir</span>
|
||
|
||
<span class="k">assert</span> <span class="n">task</span> <span class="ow">in</span> <span class="n">LEQUA2022_TASKS</span><span class="p">,</span> \
|
||
<span class="sa">f</span><span class="s1">'Unknown task </span><span class="si">{</span><span class="n">task</span><span class="si">}</span><span class="s1">. Valid ones are </span><span class="si">{</span><span class="n">LEQUA2022_TASKS</span><span class="si">}</span><span class="s1">'</span>
|
||
<span class="k">if</span> <span class="n">data_home</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
||
<span class="n">data_home</span> <span class="o">=</span> <span class="n">get_quapy_home</span><span class="p">()</span>
|
||
|
||
<span class="n">URL_TRAINDEV</span><span class="o">=</span><span class="sa">f</span><span class="s1">'https://zenodo.org/record/6546188/files/</span><span class="si">{</span><span class="n">task</span><span class="si">}</span><span class="s1">.train_dev.zip'</span>
|
||
<span class="n">URL_TEST</span><span class="o">=</span><span class="sa">f</span><span class="s1">'https://zenodo.org/record/6546188/files/</span><span class="si">{</span><span class="n">task</span><span class="si">}</span><span class="s1">.test.zip'</span>
|
||
<span class="n">URL_TEST_PREV</span><span class="o">=</span><span class="sa">f</span><span class="s1">'https://zenodo.org/record/6546188/files/</span><span class="si">{</span><span class="n">task</span><span class="si">}</span><span class="s1">.test_prevalences.zip'</span>
|
||
|
||
<span class="n">lequa_dir</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">data_home</span><span class="p">,</span> <span class="s1">'lequa2022'</span><span class="p">)</span>
|
||
<span class="n">os</span><span class="o">.</span><span class="n">makedirs</span><span class="p">(</span><span class="n">lequa_dir</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
|
||
<span class="k">def</span> <span class="nf">download_unzip_and_remove</span><span class="p">(</span><span class="n">unzipped_path</span><span class="p">,</span> <span class="n">url</span><span class="p">):</span>
|
||
<span class="n">tmp_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">lequa_dir</span><span class="p">,</span> <span class="n">task</span> <span class="o">+</span> <span class="s1">'_tmp.zip'</span><span class="p">)</span>
|
||
<span class="n">download_file_if_not_exists</span><span class="p">(</span><span class="n">url</span><span class="p">,</span> <span class="n">tmp_path</span><span class="p">)</span>
|
||
<span class="k">with</span> <span class="n">zipfile</span><span class="o">.</span><span class="n">ZipFile</span><span class="p">(</span><span class="n">tmp_path</span><span class="p">)</span> <span class="k">as</span> <span class="n">file</span><span class="p">:</span>
|
||
<span class="n">file</span><span class="o">.</span><span class="n">extractall</span><span class="p">(</span><span class="n">unzipped_path</span><span class="p">)</span>
|
||
<span class="n">os</span><span class="o">.</span><span class="n">remove</span><span class="p">(</span><span class="n">tmp_path</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">join</span><span class="p">(</span><span class="n">lequa_dir</span><span class="p">,</span> <span class="n">task</span><span class="p">)):</span>
|
||
<span class="n">download_unzip_and_remove</span><span class="p">(</span><span class="n">lequa_dir</span><span class="p">,</span> <span class="n">URL_TRAINDEV</span><span class="p">)</span>
|
||
<span class="n">download_unzip_and_remove</span><span class="p">(</span><span class="n">lequa_dir</span><span class="p">,</span> <span class="n">URL_TEST</span><span class="p">)</span>
|
||
<span class="n">download_unzip_and_remove</span><span class="p">(</span><span class="n">lequa_dir</span><span class="p">,</span> <span class="n">URL_TEST_PREV</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">task</span> <span class="ow">in</span> <span class="p">[</span><span class="s1">'T1A'</span><span class="p">,</span> <span class="s1">'T1B'</span><span class="p">]:</span>
|
||
<span class="n">load_fn</span> <span class="o">=</span> <span class="n">load_vector_documents</span>
|
||
<span class="k">elif</span> <span class="n">task</span> <span class="ow">in</span> <span class="p">[</span><span class="s1">'T2A'</span><span class="p">,</span> <span class="s1">'T2B'</span><span class="p">]:</span>
|
||
<span class="n">load_fn</span> <span class="o">=</span> <span class="n">load_raw_documents</span>
|
||
|
||
<span class="n">tr_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">lequa_dir</span><span class="p">,</span> <span class="n">task</span><span class="p">,</span> <span class="s1">'public'</span><span class="p">,</span> <span class="s1">'training_data.txt'</span><span class="p">)</span>
|
||
<span class="n">train</span> <span class="o">=</span> <span class="n">LabelledCollection</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">tr_path</span><span class="p">,</span> <span class="n">loader_func</span><span class="o">=</span><span class="n">load_fn</span><span class="p">)</span>
|
||
|
||
<span class="n">val_samples_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">lequa_dir</span><span class="p">,</span> <span class="n">task</span><span class="p">,</span> <span class="s1">'public'</span><span class="p">,</span> <span class="s1">'dev_samples'</span><span class="p">)</span>
|
||
<span class="n">val_true_prev_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">lequa_dir</span><span class="p">,</span> <span class="n">task</span><span class="p">,</span> <span class="s1">'public'</span><span class="p">,</span> <span class="s1">'dev_prevalences.txt'</span><span class="p">)</span>
|
||
<span class="n">val_gen</span> <span class="o">=</span> <span class="n">SamplesFromDir</span><span class="p">(</span><span class="n">val_samples_path</span><span class="p">,</span> <span class="n">val_true_prev_path</span><span class="p">,</span> <span class="n">load_fn</span><span class="o">=</span><span class="n">load_fn</span><span class="p">)</span>
|
||
|
||
<span class="n">test_samples_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">lequa_dir</span><span class="p">,</span> <span class="n">task</span><span class="p">,</span> <span class="s1">'public'</span><span class="p">,</span> <span class="s1">'test_samples'</span><span class="p">)</span>
|
||
<span class="n">test_true_prev_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">lequa_dir</span><span class="p">,</span> <span class="n">task</span><span class="p">,</span> <span class="s1">'public'</span><span class="p">,</span> <span class="s1">'test_prevalences.txt'</span><span class="p">)</span>
|
||
<span class="n">test_gen</span> <span class="o">=</span> <span class="n">SamplesFromDir</span><span class="p">(</span><span class="n">test_samples_path</span><span class="p">,</span> <span class="n">test_true_prev_path</span><span class="p">,</span> <span class="n">load_fn</span><span class="o">=</span><span class="n">load_fn</span><span class="p">)</span>
|
||
|
||
<span class="k">return</span> <span class="n">train</span><span class="p">,</span> <span class="n">val_gen</span><span class="p">,</span> <span class="n">test_gen</span></div>
|
||
|
||
|
||
|
||
<div class="viewcode-block" id="fetch_IFCB">
|
||
<a class="viewcode-back" href="../../../quapy.data.html#quapy.data.datasets.fetch_IFCB">[docs]</a>
|
||
<span class="k">def</span> <span class="nf">fetch_IFCB</span><span class="p">(</span><span class="n">single_sample_train</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">for_model_selection</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">data_home</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
|
||
<span class="w"> </span><span class="sd">"""</span>
|
||
<span class="sd"> Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more</span>
|
||
<span class="sd"> information on this dataset, please follow the zenodo link).</span>
|
||
<span class="sd"> This dataset is based on the data available publicly at</span>
|
||
<span class="sd"> `WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.</span>
|
||
<span class="sd"> The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.</span>
|
||
<span class="sd"> Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.</span>
|
||
|
||
<span class="sd"> The datasets are downloaded only once, and stored for fast reuse.</span>
|
||
|
||
<span class="sd"> :param single_sample_train: a boolean. If true, it will return the train dataset as a</span>
|
||
<span class="sd"> :class:`quapy.data.base.LabelledCollection` (all examples together).</span>
|
||
<span class="sd"> If false, a generator of training samples will be returned. Each example in the training set has an individual label.</span>
|
||
<span class="sd"> :param for_model_selection: if True, then holds out 30% of the training set (86 out of 286 samples) as the test set, to be used for model selection; </span>
|
||
<span class="sd"> if False, then returns the full training set as training set and the test set as the test set</span>
|
||
<span class="sd"> :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default</span>
|
||
<span class="sd"> ~/quapy_data/ directory)</span>
|
||
<span class="sd"> :return: a tuple `(train, test_gen)` where `train` is an instance of</span>
|
||
<span class="sd"> :class:`quapy.data.base.LabelledCollection`, if `single_sample_train` is true or</span>
|
||
<span class="sd"> :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir`, i.e. a sampling protocol that returns a series of samples</span>
|
||
<span class="sd"> labelled example by example. test_gen will be a :class:`quapy.data._ifcb.IFCBTestSamples`, </span>
|
||
<span class="sd"> i.e., a sampling protocol that returns a series of samples labelled by prevalence.</span>
|
||
<span class="sd"> """</span>
|
||
|
||
<span class="kn">from</span> <span class="nn">quapy.data._ifcb</span> <span class="kn">import</span> <span class="n">IFCBTrainSamplesFromDir</span><span class="p">,</span> <span class="n">IFCBTestSamples</span><span class="p">,</span> <span class="n">get_sample_list</span><span class="p">,</span> <span class="n">generate_modelselection_split</span>
|
||
|
||
<span class="k">if</span> <span class="n">data_home</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
|
||
<span class="n">data_home</span> <span class="o">=</span> <span class="n">get_quapy_home</span><span class="p">()</span>
|
||
|
||
<span class="n">URL_TRAIN</span><span class="o">=</span><span class="sa">f</span><span class="s1">'https://zenodo.org/records/10036244/files/IFCB.train.zip'</span>
|
||
<span class="n">URL_TEST</span><span class="o">=</span><span class="sa">f</span><span class="s1">'https://zenodo.org/records/10036244/files/IFCB.test.zip'</span>
|
||
<span class="n">URL_TEST_PREV</span><span class="o">=</span><span class="sa">f</span><span class="s1">'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'</span>
|
||
|
||
<span class="n">ifcb_dir</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">data_home</span><span class="p">,</span> <span class="s1">'ifcb'</span><span class="p">)</span>
|
||
<span class="n">os</span><span class="o">.</span><span class="n">makedirs</span><span class="p">(</span><span class="n">ifcb_dir</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||
|
||
<span class="k">def</span> <span class="nf">download_unzip_and_remove</span><span class="p">(</span><span class="n">unzipped_path</span><span class="p">,</span> <span class="n">url</span><span class="p">):</span>
|
||
<span class="n">tmp_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">ifcb_dir</span><span class="p">,</span> <span class="s1">'ifcb_tmp.zip'</span><span class="p">)</span>
|
||
<span class="n">download_file_if_not_exists</span><span class="p">(</span><span class="n">url</span><span class="p">,</span> <span class="n">tmp_path</span><span class="p">)</span>
|
||
<span class="k">with</span> <span class="n">zipfile</span><span class="o">.</span><span class="n">ZipFile</span><span class="p">(</span><span class="n">tmp_path</span><span class="p">)</span> <span class="k">as</span> <span class="n">file</span><span class="p">:</span>
|
||
<span class="n">file</span><span class="o">.</span><span class="n">extractall</span><span class="p">(</span><span class="n">unzipped_path</span><span class="p">)</span>
|
||
<span class="n">os</span><span class="o">.</span><span class="n">remove</span><span class="p">(</span><span class="n">tmp_path</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">ifcb_dir</span><span class="p">,</span><span class="s1">'train'</span><span class="p">)):</span>
|
||
<span class="n">download_unzip_and_remove</span><span class="p">(</span><span class="n">ifcb_dir</span><span class="p">,</span> <span class="n">URL_TRAIN</span><span class="p">)</span>
|
||
<span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">ifcb_dir</span><span class="p">,</span><span class="s1">'test'</span><span class="p">)):</span>
|
||
<span class="n">download_unzip_and_remove</span><span class="p">(</span><span class="n">ifcb_dir</span><span class="p">,</span> <span class="n">URL_TEST</span><span class="p">)</span>
|
||
<span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">ifcb_dir</span><span class="p">,</span><span class="s1">'test_prevalences.csv'</span><span class="p">)):</span>
|
||
<span class="n">download_unzip_and_remove</span><span class="p">(</span><span class="n">ifcb_dir</span><span class="p">,</span> <span class="n">URL_TEST_PREV</span><span class="p">)</span>
|
||
|
||
<span class="c1"># Load test prevalences and classes</span>
|
||
<span class="n">test_true_prev_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">ifcb_dir</span><span class="p">,</span> <span class="s1">'test_prevalences.csv'</span><span class="p">)</span>
|
||
<span class="n">test_true_prev</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="n">test_true_prev_path</span><span class="p">)</span>
|
||
<span class="n">classes</span> <span class="o">=</span> <span class="n">test_true_prev</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="mi">1</span><span class="p">:]</span>
|
||
|
||
<span class="c1">#Load train and test samples</span>
|
||
<span class="n">train_samples_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">ifcb_dir</span><span class="p">,</span><span class="s1">'train'</span><span class="p">)</span>
|
||
<span class="n">test_samples_path</span> <span class="o">=</span> <span class="n">join</span><span class="p">(</span><span class="n">ifcb_dir</span><span class="p">,</span><span class="s1">'test'</span><span class="p">)</span>
|
||
|
||
<span class="k">if</span> <span class="n">for_model_selection</span><span class="p">:</span>
|
||
<span class="c1"># In this case, return 70% of training data as the training set and 30% as the test set</span>
|
||
<span class="n">samples</span> <span class="o">=</span> <span class="n">get_sample_list</span><span class="p">(</span><span class="n">train_samples_path</span><span class="p">)</span>
|
||
<span class="n">train</span><span class="p">,</span> <span class="n">test</span> <span class="o">=</span> <span class="n">generate_modelselection_split</span><span class="p">(</span><span class="n">samples</span><span class="p">,</span> <span class="n">split</span><span class="o">=</span><span class="mf">0.3</span><span class="p">)</span>
|
||
<span class="n">train_gen</span> <span class="o">=</span> <span class="n">IFCBTrainSamplesFromDir</span><span class="p">(</span><span class="n">path_dir</span><span class="o">=</span><span class="n">train_samples_path</span><span class="p">,</span> <span class="n">classes</span><span class="o">=</span><span class="n">classes</span><span class="p">,</span> <span class="n">samples</span><span class="o">=</span><span class="n">train</span><span class="p">)</span>
|
||
|
||
<span class="c1"># Test prevalence is computed from class labels</span>
|
||
<span class="n">test_gen</span> <span class="o">=</span> <span class="n">IFCBTestSamples</span><span class="p">(</span><span class="n">path_dir</span><span class="o">=</span><span class="n">train_samples_path</span><span class="p">,</span> <span class="n">test_prevalences</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">samples</span><span class="o">=</span><span class="n">test</span><span class="p">,</span> <span class="n">classes</span><span class="o">=</span><span class="n">classes</span><span class="p">)</span>
|
||
<span class="k">else</span><span class="p">:</span>
|
||
<span class="c1"># In this case, we use all training samples as the training set and the test samples as the test set</span>
|
||
<span class="n">train_gen</span> <span class="o">=</span> <span class="n">IFCBTrainSamplesFromDir</span><span class="p">(</span><span class="n">path_dir</span><span class="o">=</span><span class="n">train_samples_path</span><span class="p">,</span> <span class="n">classes</span><span class="o">=</span><span class="n">classes</span><span class="p">)</span>
|
||
<span class="n">test_gen</span> <span class="o">=</span> <span class="n">IFCBTestSamples</span><span class="p">(</span><span class="n">path_dir</span><span class="o">=</span><span class="n">test_samples_path</span><span class="p">,</span> <span class="n">test_prevalences</span><span class="o">=</span><span class="n">test_true_prev</span><span class="p">)</span>
|
||
|
||
<span class="c1"># In the case the user wants it, join all the train samples in one LabelledCollection</span>
|
||
<span class="k">if</span> <span class="n">single_sample_train</span><span class="p">:</span>
|
||
<span class="n">train</span> <span class="o">=</span> <span class="n">LabelledCollection</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="o">*</span><span class="p">[</span><span class="n">lc</span> <span class="k">for</span> <span class="n">lc</span> <span class="ow">in</span> <span class="n">train_gen</span><span class="p">()])</span>
|
||
<span class="k">return</span> <span class="n">train</span><span class="p">,</span> <span class="n">test_gen</span>
|
||
<span class="k">else</span><span class="p">:</span>
|
||
<span class="k">return</span> <span class="n">train_gen</span><span class="p">,</span> <span class="n">test_gen</span></div>
|
||
|
||
</pre></div>
|
||
|
||
</div>
|
||
</div>
|
||
<footer>
|
||
|
||
<hr/>
|
||
|
||
<div role="contentinfo">
|
||
<p>© Copyright 2024, Alejandro Moreo.</p>
|
||
</div>
|
||
|
||
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
|
||
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
|
||
provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
||
|
||
|
||
</footer>
|
||
</div>
|
||
</div>
|
||
</section>
|
||
</div>
|
||
<script>
|
||
jQuery(function () {
|
||
SphinxRtdTheme.Navigation.enable(true);
|
||
});
|
||
</script>
|
||
|
||
</body>
|
||
</html> |