forked from moreo/QuaPy
testing baselines for lequa
This commit is contained in:
parent
1a3755eb58
commit
7468519495
|
@ -1,11 +1,6 @@
|
|||
2. tablas?
|
||||
3. fetch dataset (download, unzip, etc.)
|
||||
4. model selection
|
||||
5. plots
|
||||
8. No me convence que la lectura de los samples (caso en que no hay ground truth) viene en orden aleatorio
|
||||
9. Experimentar con vectores densos (PCA sobre tfidf por ejemplo)
|
||||
10. Si cambiamos el formato de los samples (por ejemplo, en lugar de svmlight con .txt a PCA con .dat) hay que cambiar
|
||||
cosas en el código. Está escrito varias veces un glob(*.txt)
|
||||
11. Quitar las categorias como columnas de los ficheros de prevalences
|
||||
12. sample_size cannot be set to a non-integer in GridSearchQ with protocol="gen" (it could, but this is not indicated in the doc)
|
||||
13. repair doc of GridSearchQ
|
||||
|
|
|
@ -2,13 +2,14 @@ import argparse
|
|||
import pickle
|
||||
from sklearn.linear_model import LogisticRegression as LR
|
||||
from quapy.method.aggregative import *
|
||||
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
|
||||
import quapy.functional as F
|
||||
from data import *
|
||||
import os
|
||||
import constants
|
||||
|
||||
|
||||
# LeQua official baselines for task T1B (Multiclass/Vector)
|
||||
# LeQua official baselines for task T1A (Binary/Vector) and T1B (Multiclass/Vector)
|
||||
# =========================================================
|
||||
|
||||
def baselines():
|
||||
|
@ -17,7 +18,8 @@ def baselines():
|
|||
yield PCC(LR(n_jobs=-1)), "PCC"
|
||||
yield PACC(LR(n_jobs=-1)), "PACC"
|
||||
yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
|
||||
yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
|
||||
# yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
|
||||
# yield MLPE(), "MLPE"
|
||||
|
||||
|
||||
def main(args):
|
||||
|
@ -30,7 +32,7 @@ def main(args):
|
|||
|
||||
qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
|
||||
|
||||
train = LabelledCollection.load(path_train, load_binary_vectors)
|
||||
train = LabelledCollection.load(path_train, load_vector_documents)
|
||||
nF = train.instances.shape[1]
|
||||
|
||||
print(f'number of classes: {len(train.classes_)}')
|
||||
|
@ -38,13 +40,19 @@ def main(args):
|
|||
print(f'training prevalence: {F.strprev(train.prevalence())}')
|
||||
print(f'training matrix shape: {train.instances.shape}')
|
||||
|
||||
# param_grid = {
|
||||
# 'C': np.logspace(-3, 3, 7),
|
||||
# 'class_weight': ['balanced', None]
|
||||
# }
|
||||
|
||||
param_grid = {
|
||||
'C': np.logspace(-3,3,7),
|
||||
'class_weight': ['balanced', None]
|
||||
'C': [1],
|
||||
'class_weight': ['balanced']
|
||||
}
|
||||
|
||||
def gen_samples():
|
||||
return gen_load_samples_T1(path_dev_vectors, nF, ground_truth_path=path_dev_prevs, return_id=False)
|
||||
return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False,
|
||||
load_fn=load_vector_documents, nF=nF)
|
||||
|
||||
for quantifier, q_name in baselines():
|
||||
print(f'{q_name}: Model selection')
|
||||
|
|
|
@ -13,7 +13,7 @@ SAMPLE_SIZE={
|
|||
'T1A': T1A_SAMPLE_SIZE,
|
||||
'T1B': T1B_SAMPLE_SIZE,
|
||||
'T2A': T2A_SAMPLE_SIZE,
|
||||
'T2A': T2B_SAMPLE_SIZE
|
||||
'T2B': T2B_SAMPLE_SIZE
|
||||
}
|
||||
|
||||
ERROR_TOL = 1E-3
|
||||
|
|
|
@ -12,17 +12,6 @@ from glob import glob
|
|||
import constants
|
||||
|
||||
|
||||
# def load_binary_raw_document(path):
|
||||
# documents, labels = qp.data.from_text(path, verbose=0, class2int=True)
|
||||
# labels = np.asarray(labels)
|
||||
# labels[np.logical_or(labels == 1, labels == 2)] = 0
|
||||
# labels[np.logical_or(labels == 4, labels == 5)] = 1
|
||||
# return documents, labels
|
||||
|
||||
|
||||
# def load_multiclass_raw_document(path):
|
||||
# return qp.data.from_text(path, verbose=0, class2int=False)
|
||||
|
||||
def load_category_map(path):
|
||||
cat2code = {}
|
||||
with open(path, 'rt') as fin:
|
||||
|
@ -33,7 +22,19 @@ def load_category_map(path):
|
|||
return cat2code, code2cat
|
||||
|
||||
|
||||
def load_binary_vectors(path, nF=None):
|
||||
def load_raw_documents(path):
|
||||
return qp.data.from_text(path, verbose=0, class2int=True)
|
||||
|
||||
|
||||
def load_raw_unlabelled_documents(path, vectorizer=None):
|
||||
with open(path, 'rt', encoding='utf-8') as file:
|
||||
documents = [d.strip() for d in file.readlines()]
|
||||
if vectorizer:
|
||||
documents = vectorizer.transform(documents)
|
||||
return documents, None
|
||||
|
||||
|
||||
def load_vector_documents(path, nF=None):
|
||||
X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF)
|
||||
y = y.astype(int)
|
||||
return X, y
|
||||
|
@ -53,13 +54,13 @@ def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn,
|
|||
yield (id, sample) if return_id else sample
|
||||
|
||||
|
||||
def gen_load_samples_T1(path_dir:str, nF:int, ground_truth_path:str = None, return_id=True):
|
||||
def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=True, load_fn=load_vector_documents, **load_kwargs):
|
||||
if ground_truth_path is None:
|
||||
# the generator function returns tuples (filename:str, sample:csr_matrix)
|
||||
gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_binary_vectors, nF=nF)
|
||||
# the generator function returns tuples (docid:str, sample:csr_matrix or str)
|
||||
gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, **load_kwargs)
|
||||
else:
|
||||
# the generator function returns tuples (filename:str, sample:csr_matrix, prevalence:ndarray)
|
||||
gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_binary_vectors, nF=nF)
|
||||
# the generator function returns tuples (docid:str, sample:csr_matrix or str, prevalence:ndarray)
|
||||
gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, **load_kwargs)
|
||||
for r in gen_fn:
|
||||
yield r
|
||||
|
||||
|
@ -75,16 +76,6 @@ def genSVD_load_samples_T1(load_fn, path_dir:str, nF:int, ground_truth_path:str
|
|||
yield r
|
||||
|
||||
|
||||
def gen_load_samples_T2A(path_dir:str, ground_truth_path:str = None):
|
||||
# for ... : yield
|
||||
pass
|
||||
|
||||
|
||||
def gen_load_samples_T2B(path_dir:str, ground_truth_path:str = None):
|
||||
# for ... : yield
|
||||
pass
|
||||
|
||||
|
||||
class ResultSubmission:
|
||||
|
||||
def __init__(self):
|
||||
|
|
|
@ -5,7 +5,7 @@ import constants
|
|||
import os
|
||||
import pickle
|
||||
from tqdm import tqdm
|
||||
from data import gen_load_samples_T1
|
||||
from data import gen_load_samples
|
||||
from glob import glob
|
||||
import constants
|
||||
|
||||
|
@ -27,7 +27,7 @@ def main(args):
|
|||
|
||||
# predictions
|
||||
predictions = ResultSubmission()
|
||||
for sampleid, sample in tqdm(gen_load_samples_T1(args.samples, args.nf), desc='predicting', total=nsamples):
|
||||
for sampleid, sample in tqdm(gen_load_samples(args.samples, args.nf), desc='predicting', total=nsamples):
|
||||
predictions.add(sampleid, model.quantify(sample))
|
||||
|
||||
# saving
|
||||
|
|
|
@ -941,8 +941,6 @@
|
|||
<li><a href="quapy.data.html#quapy.data.base.LabelledCollection.sampling_from_index">sampling_from_index() (quapy.data.base.LabelledCollection method)</a>
|
||||
</li>
|
||||
<li><a href="quapy.data.html#quapy.data.base.LabelledCollection.sampling_index">sampling_index() (quapy.data.base.LabelledCollection method)</a>
|
||||
</li>
|
||||
<li><a href="quapy.html#quapy.plot.save_or_show">save_or_show() (in module quapy.plot)</a>
|
||||
</li>
|
||||
<li><a href="quapy.html#quapy.util.save_text_file">save_text_file() (in module quapy.util)</a>
|
||||
</li>
|
||||
|
|
Binary file not shown.
|
@ -721,12 +721,21 @@ being ignored, a TimeoutError exception is raised. If -1 (default) then no time
|
|||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.model_selection.GridSearchQ.best_model">
|
||||
<span class="sig-name descname"><span class="pre">best_model</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.model_selection.GridSearchQ.best_model" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Returns the best model found after calling the <a class="reference internal" href="#quapy.model_selection.GridSearchQ.fit" title="quapy.model_selection.GridSearchQ.fit"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fit()</span></code></a> method, i.e., the one trained on the combination
|
||||
of hyper-parameters that minimized the error function.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>a trained quantifier</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py property">
|
||||
<dt class="sig sig-object py" id="quapy.model_selection.GridSearchQ.classes_">
|
||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">classes_</span></span><a class="headerlink" href="#quapy.model_selection.GridSearchQ.classes_" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Classes on which the quantifier has been trained.
|
||||
:return: a ndarray of shape <cite>(n_classes)</cite> with the class identifiers</p>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.model_selection.GridSearchQ.fit">
|
||||
|
@ -743,6 +752,9 @@ being ignored, a TimeoutError exception is raised. If -1 (default) then no time
|
|||
a float in [0,1] indicating the proportion of labelled data to extract from the training set</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>self</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
|
@ -763,11 +775,15 @@ a float in [0,1] indicating the proportion of labelled data to extract from the
|
|||
<dl class="py method">
|
||||
<dt class="sig sig-object py" id="quapy.model_selection.GridSearchQ.quantify">
|
||||
<span class="sig-name descname"><span class="pre">quantify</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">instances</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.model_selection.GridSearchQ.quantify" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Estimate class prevalence values</p>
|
||||
<dd><p>Estimate class prevalence values using the best model found after calling the <a class="reference internal" href="#quapy.model_selection.GridSearchQ.fit" title="quapy.model_selection.GridSearchQ.fit"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fit()</span></code></a> method.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>instances</strong> – sample containing the instances</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>a ndarray of shape <cite>(n_classes)</cite> with class prevalence estimates according to the best model found
|
||||
by the model selection process.</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
|
@ -790,7 +806,9 @@ a float in [0,1] indicating the proportion of labelled data to extract from the
|
|||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.plot.binary_bias_bins">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.plot.</span></span><span class="sig-name descname"><span class="pre">binary_bias_bins</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="pre">method_names</span></em>, <em class="sig-param"><span class="pre">true_prevs</span></em>, <em class="sig-param"><span class="pre">estim_prevs</span></em>, <em class="sig-param"><span class="pre">pos_class=1</span></em>, <em class="sig-param"><span class="pre">title=None</span></em>, <em class="sig-param"><span class="pre">nbins=5</span></em>, <em class="sig-param"><span class="pre">colormap=<matplotlib.colors.ListedColormap</span> <span class="pre">object></span></em>, <em class="sig-param"><span class="pre">vertical_xticks=False</span></em>, <em class="sig-param"><span class="pre">legend=True</span></em>, <em class="sig-param"><span class="pre">savepath=None</span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.plot.binary_bias_bins" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><dl class="field-list simple">
|
||||
<dd><p>Box-plots displaying the local bias (i.e., signed error computed as the estimated value minus the true value)
|
||||
for different bins of (true) prevalence of the positive class, for each quantification method.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>method_names</strong> – array-like with the method names for each experiment</p></li>
|
||||
|
@ -802,7 +820,7 @@ for each experiment</p></li>
|
|||
<li><p><strong>title</strong> – the title to be displayed in the plot</p></li>
|
||||
<li><p><strong>nbins</strong> – number of bins</p></li>
|
||||
<li><p><strong>colormap</strong> – the matplotlib colormap to use (default cm.tab10)</p></li>
|
||||
<li><p><strong>vertical_xticks</strong> – </p></li>
|
||||
<li><p><strong>vertical_xticks</strong> – whether or not to add secondary grid (default is False)</p></li>
|
||||
<li><p><strong>legend</strong> – whether or not to display the legend (default is True)</p></li>
|
||||
<li><p><strong>savepath</strong> – path where to save the plot. If not indicated (as default), the plot is shown.</p></li>
|
||||
</ul>
|
||||
|
@ -865,17 +883,77 @@ listed in the legend and associated with matplotlib colors).</p></li>
|
|||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.plot.brokenbar_supremacy_by_drift">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.plot.</span></span><span class="sig-name descname"><span class="pre">brokenbar_supremacy_by_drift</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">method_names</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">true_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">estim_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tr_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_bins</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">binning</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'isomerous'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">x_error</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'ae'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">y_error</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'ae'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ttest_alpha</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.005</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tail_density_threshold</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.005</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">method_order</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span 
class="n"><span class="pre">savepath</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.plot.brokenbar_supremacy_by_drift" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Displays (only) the top performing methods for different regions of the train-test shift in form of a broken
|
||||
bar chart, in which each method has bars only for those regions in which either one of the following conditions
|
||||
hold: (i) it is the best method (in average) for the bin, or (ii) it is not statistically significantly different
|
||||
(in average) as according to a two-sided t-test on independent samples at confidence <cite>ttest_alpha</cite>.
|
||||
The binning can be made “isometric” (same size), or “isomerous” (same number of experiments – default). A second
|
||||
plot is displayed on top, that displays the distribution of experiments for each bin (when binning=”isometric”) or
|
||||
the percentiles points of the distribution (when binning=”isomerous”).</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>method_names</strong> – array-like with the method names for each experiment</p></li>
|
||||
<li><p><strong>true_prevs</strong> – array-like with the true prevalence values (each being a ndarray with n_classes components) for
|
||||
each experiment</p></li>
|
||||
<li><p><strong>estim_prevs</strong> – array-like with the estimated prevalence values (each being a ndarray with n_classes components)
|
||||
for each experiment</p></li>
|
||||
<li><p><strong>tr_prevs</strong> – training prevalence of each experiment</p></li>
|
||||
<li><p><strong>n_bins</strong> – number of bins in which the y-axis is to be divided (default is 20)</p></li>
|
||||
<li><p><strong>binning</strong> – type of binning, either “isomerous” (default) or “isometric”</p></li>
|
||||
<li><p><strong>x_error</strong> – a string representing the name of an error function (as defined in <cite>quapy.error</cite>) to be used for
|
||||
measuring the amount of train-test shift (default is “ae”)</p></li>
|
||||
<li><p><strong>y_error</strong> – a string representing the name of an error function (as defined in <cite>quapy.error</cite>) to be used for
|
||||
measuring the amount of error in the prevalence estimations (default is “ae”)</p></li>
|
||||
<li><p><strong>ttest_alpha</strong> – the confidence interval above which a p-value (two-sided t-test on independent samples) is
|
||||
to be considered as an indicator that the two means are not statistically significantly different. Default is
|
||||
0.005, meaning that a <cite>p-value > 0.005</cite> indicates the two methods involved are to be considered similar</p></li>
|
||||
<li><p><strong>tail_density_threshold</strong> – sets a threshold on the density of experiments (over the total number of experiments)
|
||||
below which a bin in the tail (i.e., the right-most ones) will be discarded. This is in order to avoid some
|
||||
bins to be shown for train-test outliers.</p></li>
|
||||
<li><p><strong>method_order</strong> – if indicated (default is None), imposes the order in which the methods are processed (i.e.,
|
||||
listed in the legend and associated with matplotlib colors).</p></li>
|
||||
<li><p><strong>savepath</strong> – path where to save the plot. If not indicated (as default), the plot is shown.</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p></p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.plot.error_by_drift">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.plot.</span></span><span class="sig-name descname"><span class="pre">error_by_drift</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">method_names</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">true_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">estim_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tr_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_bins</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">error_name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'ae'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">show_std</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">show_density</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">logscale</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">title</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'Quantification</span> <span class="pre">error</span> <span class="pre">as</span> <span class="pre">a</span> <span class="pre">function</span> <span class="pre">of</span> <span class="pre">distribution</span> <span class="pre">shift'</span></span></em>, <em class="sig-param"><span 
class="n"><span class="pre">savepath</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">vlines</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">method_order</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.plot.error_by_drift" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.plot.save_or_show">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.plot.</span></span><span class="sig-name descname"><span class="pre">save_or_show</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">savepath</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.plot.save_or_show" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.plot.</span></span><span class="sig-name descname"><span class="pre">error_by_drift</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">method_names</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">true_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">estim_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tr_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_bins</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">error_name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'ae'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">show_std</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">show_density</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">logscale</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">title</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'Quantification</span> <span class="pre">error</span> <span class="pre">as</span> <span class="pre">a</span> <span class="pre">function</span> <span class="pre">of</span> <span class="pre">distribution</span> <span class="pre">shift'</span></span></em>, <em class="sig-param"><span 
class="n"><span class="pre">vlines</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">method_order</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">savepath</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.plot.error_by_drift" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Plots the error (along the x-axis, as measured in terms of <cite>error_name</cite>) as a function of the train-test shift
|
||||
(along the y-axis, as measured in terms of <a class="reference internal" href="#quapy.error.ae" title="quapy.error.ae"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.ae()</span></code></a>). This plot is useful especially for multiclass
|
||||
problems, in which “diagonal plots” may be cumbersome, and in order to gain understanding about how methods
|
||||
fare in different regions of the prior probability shift spectrum (e.g., in the low-shift regime vs. in the
|
||||
high-shift regime).</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>method_names</strong> – array-like with the method names for each experiment</p></li>
|
||||
<li><p><strong>true_prevs</strong> – array-like with the true prevalence values (each being a ndarray with n_classes components) for
|
||||
each experiment</p></li>
|
||||
<li><p><strong>estim_prevs</strong> – array-like with the estimated prevalence values (each being a ndarray with n_classes components)
|
||||
for each experiment</p></li>
|
||||
<li><p><strong>tr_prevs</strong> – training prevalence of each experiment</p></li>
|
||||
<li><p><strong>n_bins</strong> – number of bins in which the y-axis is to be divided (default is 20)</p></li>
|
||||
<li><p><strong>error_name</strong> – a string representing the name of an error function (as defined in <cite>quapy.error</cite>, default is “ae”)</p></li>
|
||||
<li><p><strong>show_std</strong> – whether or not to show standard deviations as color bands (default is False)</p></li>
|
||||
<li><p><strong>show_density</strong> – whether or not to display the distribution of experiments for each bin (default is True)</p></li>
|
||||
<li><p><strong>logscale</strong> – whether or not to log-scale the y-error measure (default is False)</p></li>
|
||||
<li><p><strong>title</strong> – title of the plot (default is “Quantification error as a function of distribution shift”)</p></li>
|
||||
<li><p><strong>vlines</strong> – array-like list of values (default is None). If indicated, highlights some regions of the space
|
||||
using vertical dotted lines.</p></li>
|
||||
<li><p><strong>method_order</strong> – if indicated (default is None), imposes the order in which the methods are processed (i.e.,
|
||||
listed in the legend and associated with matplotlib colors).</p></li>
|
||||
<li><p><strong>savepath</strong> – path where to save the plot. If not indicated (as default), the plot is shown.</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
</section>
|
||||
<section id="module-quapy.util">
|
||||
|
@ -884,82 +962,184 @@ listed in the legend and associated with matplotlib colors).</p></li>
|
|||
<dt class="sig sig-object py" id="quapy.util.EarlyStop">
|
||||
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">EarlyStop</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">patience</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lower_is_better</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.EarlyStop" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||||
<p>A class implementing the early-stopping condition typically used for training neural networks.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>patience</strong> – the number of (consecutive) times that a monitored evaluation metric (typically obtained in a</p>
|
||||
</dd>
|
||||
</dl>
|
||||
<p>held-out validation split) can be found to be worse than the best one obtained so far, before flagging the
|
||||
stopping condition. An instance of this class is <cite>callable</cite>, and is to be used as follows:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">earlystop</span> <span class="o">=</span> <span class="n">EarlyStop</span><span class="p">(</span><span class="n">patience</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">lower_is_better</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="n">earlystop</span><span class="p">(</span><span class="mf">0.9</span><span class="p">,</span> <span class="n">epoch</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="n">earlystop</span><span class="p">(</span><span class="mf">0.7</span><span class="p">,</span> <span class="n">epoch</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="n">earlystop</span><span class="o">.</span><span class="n">IMPROVED</span> <span class="c1"># is True</span>
|
||||
<span class="gp">>>> </span><span class="n">earlystop</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">epoch</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="n">earlystop</span><span class="o">.</span><span class="n">STOP</span> <span class="c1"># is False (patience=1)</span>
|
||||
<span class="gp">>>> </span><span class="n">earlystop</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">epoch</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="n">earlystop</span><span class="o">.</span><span class="n">STOP</span> <span class="c1"># is True (patience=0)</span>
|
||||
<span class="gp">>>> </span><span class="n">earlystop</span><span class="o">.</span><span class="n">best_epoch</span> <span class="c1"># is 1</span>
|
||||
<span class="gp">>>> </span><span class="n">earlystop</span><span class="o">.</span><span class="n">best_score</span> <span class="c1"># is 0.7</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>lower_is_better</strong> – if True (default) the metric is to be minimized.</p>
|
||||
</dd>
|
||||
<dt class="field-even">Variables</dt>
|
||||
<dd class="field-even"><ul class="simple">
|
||||
<li><p><strong>best_score</strong> – keeps track of the best value seen so far</p></li>
|
||||
<li><p><strong>best_epoch</strong> – keeps track of the epoch in which the best score was set</p></li>
|
||||
<li><p><strong>STOP</strong> – flag (boolean) indicating the stopping condition</p></li>
|
||||
<li><p><strong>IMPROVED</strong> – flag (boolean) indicating whether there was an improvement in the last call</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.util.create_if_not_exist">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">create_if_not_exist</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.create_if_not_exist" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>An alias to <cite>os.makedirs(path, exist_ok=True)</cite> that also returns the path. This is useful in cases like, e.g.:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">path</span> <span class="o">=</span> <span class="n">create_if_not_exist</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="nb">dir</span><span class="p">,</span> <span class="n">subdir</span><span class="p">,</span> <span class="n">anotherdir</span><span class="p">))</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>path</strong> – path to create</p>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the path itself</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.util.create_parent_dir">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">create_parent_dir</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.create_parent_dir" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Creates the parent dir (if any) of a given path, if not exists. E.g., for <cite>./path/to/file.txt</cite>, the path <cite>./path/to</cite>
|
||||
is created.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>path</strong> – the path</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.util.download_file">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">download_file</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">archive_filename</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.download_file" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Downloads a file from a url</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>url</strong> – the url</p></li>
|
||||
<li><p><strong>archive_filename</strong> – destination filename</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.util.download_file_if_not_exists">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">download_file_if_not_exists</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">archive_path</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.download_file_if_not_exists" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">download_file_if_not_exists</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">archive_filename</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.download_file_if_not_exists" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Downloads a file (using <a class="reference internal" href="#quapy.util.download_file" title="quapy.util.download_file"><code class="xref py py-meth docutils literal notranslate"><span class="pre">download_file()</span></code></a>) if the file does not exist.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>url</strong> – the url</p></li>
|
||||
<li><p><strong>archive_filename</strong> – destination filename</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.util.get_quapy_home">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">get_quapy_home</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.get_quapy_home" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Gets the home directory of QuaPy, i.e., the directory where QuaPy saves permanent data, such as downloaded datasets.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Returns</dt>
|
||||
<dd class="field-odd"><p>a string representing the path</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.util.map_parallel">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">map_parallel</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">func</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.map_parallel" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then
|
||||
func is applied in two parallel processes to args[0:50] and to args[50:99]</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>func</strong> – function to be parallelized</p></li>
|
||||
<li><p><strong>args</strong> – array-like of arguments to be passed to the function in different parallel calls</p></li>
|
||||
<li><p><strong>n_jobs</strong> – the number of workers</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.util.parallel">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">parallel</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">func</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.parallel" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>A wrapper of multiprocessing:
|
||||
Parallel(n_jobs=n_jobs)(</p>
|
||||
<blockquote>
|
||||
<div><p>delayed(func)(args_i) for args_i in args</p>
|
||||
</div></blockquote>
|
||||
<p>)
|
||||
that takes the quapy.environ variable as input silently</p>
|
||||
<dd><p>A wrapper of multiprocessing:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">Parallel</span><span class="p">(</span><span class="n">n_jobs</span><span class="o">=</span><span class="n">n_jobs</span><span class="p">)(</span>
|
||||
<span class="gp">>>> </span> <span class="n">delayed</span><span class="p">(</span><span class="n">func</span><span class="p">)(</span><span class="n">args_i</span><span class="p">)</span> <span class="k">for</span> <span class="n">args_i</span> <span class="ow">in</span> <span class="n">args</span>
|
||||
<span class="gp">>>> </span><span class="p">)</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>that takes the <cite>quapy.environ</cite> variable as input silently</p>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.util.pickled_resource">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">pickled_resource</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">pickle_path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">generation_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.pickled_resource" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Allows for fast reuse of resources that are generated only once by calling generation_func(<a href="#id1"><span class="problematic" id="id2">*</span></a>args). The next times
|
||||
this function is invoked, it loads the pickled resource. Example:
|
||||
def some_array(n):</p>
|
||||
<blockquote>
|
||||
<div><p>return np.random.rand(n)</p>
|
||||
</div></blockquote>
|
||||
<p>pickled_resource(‘./my_array.pkl’, some_array, 10) # the resource does not exist: it is created by some_array(10)
|
||||
pickled_resource(‘./my_array.pkl’, some_array, 10) # the resource exists: it is loaded from ‘./my_array.pkl’
|
||||
:param pickle_path: the path where to save (first time) and load (next times) the resource
|
||||
:param generation_func: the function that generates the resource, in case it does not exist in pickle_path
|
||||
:param args: any arg that generation_func uses for generating the resources
|
||||
:return: the resource</p>
|
||||
this function is invoked, it loads the pickled resource. Example:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="k">def</span> <span class="nf">some_array</span><span class="p">(</span><span class="n">n</span><span class="p">):</span> <span class="c1"># a mock resource created with one parameter (`n`)</span>
|
||||
<span class="gp">>>> </span> <span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="n">n</span><span class="p">)</span>
|
||||
<span class="gp">>>> </span><span class="n">pickled_resource</span><span class="p">(</span><span class="s1">'./my_array.pkl'</span><span class="p">,</span> <span class="n">some_array</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> <span class="c1"># the resource does not exist: it is created by calling some_array(10)</span>
|
||||
<span class="gp">>>> </span><span class="n">pickled_resource</span><span class="p">(</span><span class="s1">'./my_array.pkl'</span><span class="p">,</span> <span class="n">some_array</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> <span class="c1"># the resource exists; it is loaded from './my_array.pkl'</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>pickle_path</strong> – the path where to save (first time) and load (next times) the resource</p></li>
|
||||
<li><p><strong>generation_func</strong> – the function that generates the resource, in case it does not exist in pickle_path</p></li>
|
||||
<li><p><strong>args</strong> – any arg that generation_func uses for generating the resources</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
<dt class="field-even">Returns</dt>
|
||||
<dd class="field-even"><p>the resource</p>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.util.save_text_file">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">save_text_file</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">text</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.save_text_file" title="Permalink to this definition">¶</a></dt>
|
||||
<dd></dd></dl>
|
||||
<dd><p>Saves a text file to disk, given its full path, and creates the parent directory if missing.</p>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><ul class="simple">
|
||||
<li><p><strong>path</strong> – path where to save the text.</p></li>
|
||||
<li><p><strong>text</strong> – text to save.</p></li>
|
||||
</ul>
|
||||
</dd>
|
||||
</dl>
|
||||
</dd></dl>
|
||||
|
||||
<dl class="py function">
|
||||
<dt class="sig sig-object py" id="quapy.util.temp_seed">
|
||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">temp_seed</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">seed</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.temp_seed" title="Permalink to this definition">¶</a></dt>
|
||||
<dd><p>Can be used in a “with” context to set a temporal seed without modifying the outer numpy’s current state. E.g.:
|
||||
with temp_seed(random_seed):</p>
|
||||
<blockquote>
|
||||
<div><p># do any computation depending on np.random functionality</p>
|
||||
</div></blockquote>
|
||||
<dd><p>Can be used in a “with” context to set a temporary seed without modifying the outer numpy’s current state. E.g.:</p>
|
||||
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="k">with</span> <span class="n">temp_seed</span><span class="p">(</span><span class="n">random_seed</span><span class="p">):</span>
|
||||
<span class="gp">>>> </span> <span class="k">pass</span> <span class="c1"># do any computation depending on np.random functionality</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<dl class="field-list simple">
|
||||
<dt class="field-odd">Parameters</dt>
|
||||
<dd class="field-odd"><p><strong>seed</strong> – the seed to set within the “with” context</p>
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -18,13 +18,16 @@ def from_text(path, encoding='utf-8', verbose=1, class2int=True):
|
|||
for line in file:
|
||||
line = line.strip()
|
||||
if line:
|
||||
label, sentence = line.split('\t')
|
||||
sentence = sentence.strip()
|
||||
if class2int:
|
||||
label = int(label)
|
||||
if sentence:
|
||||
all_sentences.append(sentence)
|
||||
all_labels.append(label)
|
||||
try:
|
||||
label, sentence = line.split('\t')
|
||||
sentence = sentence.strip()
|
||||
if class2int:
|
||||
label = int(label)
|
||||
if sentence:
|
||||
all_sentences.append(sentence)
|
||||
all_labels.append(label)
|
||||
except ValueError:
|
||||
print(f'format error in {line}')
|
||||
return all_sentences, all_labels
|
||||
|
||||
|
||||
|
|
|
@ -5,6 +5,25 @@ import numpy as np
|
|||
|
||||
|
||||
def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, return_constrained_dim=False):
|
||||
"""
|
||||
Generates vectors of prevalence values artificially drawn from an exhaustive grid of prevalence values. The
|
||||
number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,
|
||||
`n_prevalences=11` then the prevalence values of the grid are taken from [0, 0.1, 0.2, ..., 0.9, 1]. Only
|
||||
valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
|
||||
valid vector of prevalence values, `repeat` copies are returned. The vector of prevalence values can be
|
||||
implicit (by setting `return_constrained_dim=False`), meaning that the last dimension (which is constrained
|
||||
to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to 1).
|
||||
|
||||
:param dimensions: the number of classes
|
||||
:param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the grid
|
||||
(default is 21)
|
||||
:param repeat: number of copies for each valid prevalence vector (default is 1)
|
||||
:param return_constrained_dim: set to True to return all dimensions, or to False (default) for omitting the
|
||||
constrained dimension
|
||||
:return: an ndarray of shape `(n, dimensions)` if `return_constrained_dim=True` or of shape `(n, dimensions-1)`
|
||||
if `return_constrained_dim=False`, where `n` is the number of valid combinations found in the grid multiplied
|
||||
by `repeat`
|
||||
"""
|
||||
s = np.linspace(0., 1., n_prevalences, endpoint=True)
|
||||
s = [s] * (dimensions - 1)
|
||||
prevs = [p for p in itertools.product(*s, repeat=1) if sum(p)<=1]
|
||||
|
@ -18,9 +37,10 @@ def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, retur
|
|||
|
||||
def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
|
||||
"""
|
||||
Produces a uniformly separated values of prevalence. By default, produces an array 21 prevalences, with step 0.05
|
||||
and with the limits smoothed, i.e.:
|
||||
Produces uniformly separated values of prevalence. By default, produces an array of 21 prevalence values, with
|
||||
step 0.05 and with the limits smoothed, i.e.:
|
||||
[0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99]
|
||||
|
||||
:param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
|
||||
:param repeat: number of times each prevalence is to be repeated (defaults to 1)
|
||||
:param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
|
||||
|
@ -36,12 +56,20 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
|
|||
return p
|
||||
|
||||
|
||||
def prevalence_from_labels(labels, classes_):
|
||||
def prevalence_from_labels(labels, classes):
|
||||
"""
|
||||
Computes the prevalence values from a vector of labels.
|
||||
|
||||
:param labels: array-like of shape `(n_instances)` with the label for each instance
|
||||
:param classes: the class labels. This is needed in order to correctly compute the prevalence vector even when
|
||||
some classes have no examples.
|
||||
:return: an ndarray of shape `(len(classes))` with the class prevalence values
|
||||
"""
|
||||
if labels.ndim != 1:
|
||||
raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
|
||||
unique, counts = np.unique(labels, return_counts=True)
|
||||
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
|
||||
prevalences = np.asarray([by_class[class_] for class_ in classes_], dtype=np.float)
|
||||
prevalences = np.asarray([by_class[class_] for class_ in classes], dtype=np.float)
|
||||
prevalences /= prevalences.sum()
|
||||
return prevalences
|
||||
|
||||
|
|
|
@ -151,9 +151,11 @@ class GridSearchQ(BaseQuantifier):
|
|||
def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float, Callable] = None):
|
||||
""" Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
|
||||
the error metric.
|
||||
|
||||
:param training: the training set on which to optimize the hyperparameters
|
||||
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
|
||||
a float in [0,1] indicating the proportion of labelled data to extract from the training set
|
||||
:return: self
|
||||
"""
|
||||
if val_split is None:
|
||||
val_split = self.val_split
|
||||
|
@ -213,15 +215,21 @@ class GridSearchQ(BaseQuantifier):
|
|||
return self
|
||||
|
||||
def quantify(self, instances):
|
||||
"""Estimate class prevalence values
|
||||
"""Estimate class prevalence values using the best model found after calling the :meth:`fit` method.
|
||||
|
||||
:param instances: sample containing the instances
|
||||
:return: a ndarray of shape `(n_classes)` with class prevalence estimates as according to the best model found
|
||||
by the model selection process.
|
||||
"""
|
||||
assert hasattr(self, 'best_model_'), 'quantify called before fit'
|
||||
return self.best_model().quantify(instances)
|
||||
|
||||
@property
|
||||
def classes_(self):
|
||||
"""
|
||||
Classes on which the quantifier has been trained.
|
||||
:return: a ndarray of shape `(n_classes)` with the class identifiers
|
||||
"""
|
||||
return self.best_model().classes_
|
||||
|
||||
def set_params(self, **parameters):
|
||||
|
@ -240,6 +248,12 @@ class GridSearchQ(BaseQuantifier):
|
|||
return self.param_grid
|
||||
|
||||
def best_model(self):
|
||||
"""
|
||||
Returns the best model found after calling the :meth:`fit` method, i.e., the one trained on the combination
|
||||
of hyper-parameters that minimized the error function.
|
||||
|
||||
:return: a trained quantifier
|
||||
"""
|
||||
if hasattr(self, 'best_model_'):
|
||||
return self.best_model_
|
||||
raise ValueError('best_model called before fit')
|
||||
|
|
130
quapy/plot.py
130
quapy/plot.py
|
@ -82,7 +82,7 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
|
|||
bbox_to_anchor=(1, -0.5),
|
||||
ncol=(len(method_names)+1)//2)
|
||||
|
||||
save_or_show(savepath)
|
||||
_save_or_show(savepath)
|
||||
|
||||
|
||||
def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
|
||||
|
@ -116,12 +116,14 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title
|
|||
plt.xticks(rotation=45)
|
||||
ax.set(ylabel='error bias', title=title)
|
||||
|
||||
save_or_show(savepath)
|
||||
_save_or_show(savepath)
|
||||
|
||||
|
||||
def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10,
|
||||
vertical_xticks=False, legend=True, savepath=None):
|
||||
"""
|
||||
Box-plots displaying the local bias (i.e., signed error computed as the estimated value minus the true value)
|
||||
for different bins of (true) prevalence of the positive class, for each quantification method.
|
||||
|
||||
:param method_names: array-like with the method names for each experiment
|
||||
:param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
|
||||
|
@ -132,7 +134,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
|
|||
:param title: the title to be displayed in the plot
|
||||
:param nbins: number of bins
|
||||
:param colormap: the matplotlib colormap to use (default cm.tab10)
|
||||
:param vertical_xticks:
|
||||
:param vertical_xticks: whether or not to add secondary grid (default is False)
|
||||
:param legend: whether or not to display the legend (default is True)
|
||||
:param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
|
||||
"""
|
||||
|
@ -202,39 +204,44 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
|
|||
|
||||
# x-axis and y-axis labels and limits
|
||||
ax.set(xlabel='prevalence', ylabel='error bias', title=title)
|
||||
# ax.set_ylim(-1, 1)
|
||||
ax.set_xlim(0, 1)
|
||||
|
||||
save_or_show(savepath)
|
||||
_save_or_show(savepath)
|
||||
|
||||
|
||||
def _merge(method_names, true_prevs, estim_prevs):
|
||||
ndims = true_prevs[0].shape[1]
|
||||
data = defaultdict(lambda: {'true': np.empty(shape=(0, ndims)), 'estim': np.empty(shape=(0, ndims))})
|
||||
method_order=[]
|
||||
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
|
||||
data[method]['true'] = np.concatenate([data[method]['true'], true_prev])
|
||||
data[method]['estim'] = np.concatenate([data[method]['estim'], estim_prev])
|
||||
if method not in method_order:
|
||||
method_order.append(method)
|
||||
true_prevs_ = [data[m]['true'] for m in method_order]
|
||||
estim_prevs_ = [data[m]['estim'] for m in method_order]
|
||||
return method_order, true_prevs_, estim_prevs_
|
||||
|
||||
|
||||
def _set_colors(ax, n_methods):
|
||||
NUM_COLORS = n_methods
|
||||
cm = plt.get_cmap('tab20')
|
||||
ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
|
||||
|
||||
|
||||
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, error_name='ae', show_std=False,
|
||||
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||
n_bins=20, error_name='ae', show_std=False,
|
||||
show_density=True,
|
||||
logscale=False,
|
||||
title=f'Quantification error as a function of distribution shift',
|
||||
savepath=None,
|
||||
vlines=None,
|
||||
method_order=None):
|
||||
method_order=None,
|
||||
savepath=None):
|
||||
"""
|
||||
Plots the error (along the y-axis, as measured in terms of `error_name`) as a function of the train-test shift
|
||||
(along the x-axis, as measured in terms of :meth:`quapy.error.ae`). This plot is useful especially for multiclass
|
||||
problems, in which "diagonal plots" may be cumbersome, and in order to gain understanding about how methods
|
||||
fare in different regions of the prior probability shift spectrum (e.g., in the low-shift regime vs. in the
|
||||
high-shift regime).
|
||||
|
||||
:param method_names: array-like with the method names for each experiment
|
||||
:param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
|
||||
each experiment
|
||||
:param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
|
||||
for each experiment
|
||||
:param tr_prevs: training prevalence of each experiment
|
||||
:param n_bins: number of bins in which the y-axis is to be divided (default is 20)
|
||||
:param error_name: a string representing the name of an error function (as defined in `quapy.error`, default is "ae")
|
||||
:param show_std: whether or not to show standard deviations as color bands (default is False)
|
||||
:param show_density: whether or not to display the distribution of experiments for each bin (default is True)
|
||||
:param logscale: whether or not to log-scale the y-error measure (default is False)
|
||||
:param title: title of the plot (default is "Quantification error as a function of distribution shift")
|
||||
:param vlines: array-like list of values (default is None). If indicated, highlights some regions of the space
|
||||
using vertical dotted lines.
|
||||
:param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e.,
|
||||
listed in the legend and associated with matplotlib colors).
|
||||
:param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
|
||||
"""
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
ax.grid()
|
||||
|
@ -245,7 +252,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, e
|
|||
# get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same
|
||||
# order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to
|
||||
# x_error function) and 'y' is the estim-test shift (computed as according to y_error)
|
||||
data = __join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
||||
data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
||||
|
||||
_set_colors(ax, n_methods=len(method_order))
|
||||
|
||||
|
@ -302,13 +309,46 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, e
|
|||
ax.set_xlim(0, max_x)
|
||||
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
|
||||
|
||||
save_or_show(savepath)
|
||||
_save_or_show(savepath)
|
||||
|
||||
|
||||
def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, binning='isomerous',
|
||||
def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||
n_bins=20, binning='isomerous',
|
||||
x_error='ae', y_error='ae', ttest_alpha=0.005, tail_density_threshold=0.005,
|
||||
method_order=None,
|
||||
savepath=None):
|
||||
"""
|
||||
Displays (only) the top performing methods for different regions of the train-test shift in form of a broken
|
||||
bar chart, in which each method has bars only for those regions in which either one of the following conditions
|
||||
hold: (i) it is the best method (in average) for the bin, or (ii) it is not statistically significantly different
|
||||
(in average) from the best method, as according to a two-sided t-test on independent samples at confidence `ttest_alpha`.
|
||||
The binning can be made "isometric" (same size), or "isomerous" (same number of experiments -- default). A second
|
||||
plot is displayed on top, that displays the distribution of experiments for each bin (when binning="isometric") or
|
||||
the percentiles points of the distribution (when binning="isomerous").
|
||||
|
||||
:param method_names: array-like with the method names for each experiment
|
||||
:param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
|
||||
each experiment
|
||||
:param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
|
||||
for each experiment
|
||||
:param tr_prevs: training prevalence of each experiment
|
||||
:param n_bins: number of bins in which the y-axis is to be divided (default is 20)
|
||||
:param binning: type of binning, either "isomerous" (default) or "isometric"
|
||||
:param x_error: a string representing the name of an error function (as defined in `quapy.error`) to be used for
|
||||
measuring the amount of train-test shift (default is "ae")
|
||||
:param y_error: a string representing the name of an error function (as defined in `quapy.error`) to be used for
|
||||
measuring the amount of error in the prevalence estimations (default is "ae")
|
||||
:param ttest_alpha: the confidence interval above which a p-value (two-sided t-test on independent samples) is
|
||||
to be considered as an indicator that the two means are not statistically significantly different. Default is
|
||||
0.005, meaning that a `p-value > 0.005` indicates the two methods involved are to be considered similar
|
||||
:param tail_density_threshold: sets a threshold on the density of experiments (over the total number of experiments)
|
||||
below which a bin in the tail (i.e., the right-most ones) will be discarded. This is in order to avoid some
|
||||
bins to be shown for train-test outliers.
|
||||
:param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e.,
|
||||
listed in the legend and associated with matplotlib colors).
|
||||
:param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
|
||||
:return:
|
||||
"""
|
||||
assert binning in ['isomerous', 'isometric'], 'unknown binning type; valid types are "isomerous" and "isometric"'
|
||||
|
||||
x_error = getattr(qp.error, x_error)
|
||||
|
@ -317,7 +357,7 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs
|
|||
# get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same
|
||||
# order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to
|
||||
# x_error function) and 'y' is the estim-test shift (computed as according to y_error)
|
||||
data = __join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
||||
data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
||||
|
||||
if binning == 'isomerous':
|
||||
# take bins containing the same amount of examples
|
||||
|
@ -449,10 +489,30 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs
|
|||
ax.get_xaxis().set_visible(False)
|
||||
plt.subplots_adjust(wspace=0, hspace=0)
|
||||
|
||||
save_or_show(savepath)
|
||||
_save_or_show(savepath)
|
||||
|
||||
|
||||
def save_or_show(savepath):
|
||||
def _merge(method_names, true_prevs, estim_prevs):
|
||||
ndims = true_prevs[0].shape[1]
|
||||
data = defaultdict(lambda: {'true': np.empty(shape=(0, ndims)), 'estim': np.empty(shape=(0, ndims))})
|
||||
method_order=[]
|
||||
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
|
||||
data[method]['true'] = np.concatenate([data[method]['true'], true_prev])
|
||||
data[method]['estim'] = np.concatenate([data[method]['estim'], estim_prev])
|
||||
if method not in method_order:
|
||||
method_order.append(method)
|
||||
true_prevs_ = [data[m]['true'] for m in method_order]
|
||||
estim_prevs_ = [data[m]['estim'] for m in method_order]
|
||||
return method_order, true_prevs_, estim_prevs_
|
||||
|
||||
|
||||
def _set_colors(ax, n_methods):
    """
    Sets the color cycle of the axes so that it contains one color per method, each taken from the 'tab20'
    colormap at position `i / n_methods` (for `i` in `0 .. n_methods-1`).

    :param ax: the axes object on which `set_prop_cycle` is invoked
    :param n_methods: the number of colors to generate
    """
    palette = plt.get_cmap('tab20')
    colors = [palette(1. * idx / n_methods) for idx in range(n_methods)]
    ax.set_prop_cycle(color=colors)
|
||||
|
||||
|
||||
def _save_or_show(savepath):
|
||||
# if savepath is specified, then saves the plot in that path; otherwise the plot is shown
|
||||
if savepath is not None:
|
||||
qp.util.create_parent_dir(savepath)
|
||||
|
@ -462,7 +522,7 @@ def save_or_show(savepath):
|
|||
plt.show()
|
||||
|
||||
|
||||
def __join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order):
|
||||
def _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order):
|
||||
data = defaultdict(lambda: {'x': np.empty(shape=(0)), 'y': np.empty(shape=(0))})
|
||||
|
||||
if method_order is None:
|
||||
|
|
111
quapy/util.py
111
quapy/util.py
|
@ -23,6 +23,10 @@ def map_parallel(func, args, n_jobs):
|
|||
"""
|
||||
Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then
|
||||
func is applied in two parallel processes to args[0:50] and to args[50:99]
|
||||
|
||||
:param func: function to be parallelized
|
||||
:param args: array-like of arguments to be passed to the function in different parallel calls
|
||||
:param n_jobs: the number of workers
|
||||
"""
|
||||
args = np.asarray(args)
|
||||
slices = _get_parallel_slices(len(args), n_jobs)
|
||||
|
@ -35,10 +39,12 @@ def map_parallel(func, args, n_jobs):
|
|||
def parallel(func, args, n_jobs):
|
||||
"""
|
||||
A wrapper of multiprocessing:
|
||||
Parallel(n_jobs=n_jobs)(
|
||||
delayed(func)(args_i) for args_i in args
|
||||
)
|
||||
that takes the quapy.environ variable as input silently
|
||||
|
||||
>>> Parallel(n_jobs=n_jobs)(
|
||||
>>> delayed(func)(args_i) for args_i in args
|
||||
>>> )
|
||||
|
||||
that takes the `quapy.environ` variable as input silently
|
||||
"""
|
||||
def func_dec(environ, *args):
|
||||
qp.environ = environ
|
||||
|
@ -52,8 +58,10 @@ def parallel(func, args, n_jobs):
|
|||
def temp_seed(seed):
|
||||
"""
|
||||
Can be used in a "with" context to set a temporary seed without modifying the outer numpy's current state. E.g.:
|
||||
with temp_seed(random_seed):
|
||||
# do any computation depending on np.random functionality
|
||||
|
||||
>>> with temp_seed(random_seed):
|
||||
>>> pass # do any computation depending on np.random functionality
|
||||
|
||||
:param seed: the seed to set within the "with" context
|
||||
"""
|
||||
state = np.random.get_state()
|
||||
|
@ -65,6 +73,12 @@ def temp_seed(seed):
|
|||
|
||||
|
||||
def download_file(url, archive_filename):
|
||||
"""
|
||||
Downloads a file from a url
|
||||
|
||||
:param url: the url
|
||||
:param archive_filename: destination filename
|
||||
"""
|
||||
def progress(blocknum, bs, size):
|
||||
total_sz_mb = '%.2f MB' % (size / 1e6)
|
||||
current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
|
||||
|
@ -74,31 +88,62 @@ def download_file(url, archive_filename):
|
|||
print("")
|
||||
|
||||
|
||||
def download_file_if_not_exists(url, archive_filename):
    """
    Downloads a file (using :meth:`download_file`) if the file does not exist.

    :param url: the url
    :param archive_filename: destination filename
    """
    if os.path.exists(archive_filename):
        return
    # a bare file name has an empty dirname, and os.makedirs('') raises; in that case the file goes to the
    # current directory and there is nothing to create
    parent_dir = os.path.dirname(archive_filename)
    if parent_dir:
        create_if_not_exist(parent_dir)
    download_file(url, archive_filename)
|
||||
|
||||
|
||||
def create_if_not_exist(path):
    """
    An alias to `os.makedirs(path, exist_ok=True)` that also returns the path. This is useful in cases like, e.g.:

    >>> path = create_if_not_exist(os.path.join(dir, subdir, anotherdir))

    An empty path (e.g., the result of `os.path.dirname` on a bare file name) is returned untouched, since there
    is no directory to create in that case.

    :param path: path to create
    :return: the path itself
    """
    # os.makedirs('') raises FileNotFoundError; an empty path means "no directory component"
    if path != '':
        os.makedirs(path, exist_ok=True)
    return path
|
||||
|
||||
|
||||
def get_quapy_home():
    """
    Returns the home directory of QuaPy, i.e., the directory where QuaPy saves permanent data, such as downloaded
    datasets. The directory is the folder `quapy_data` inside the user's home, and is created if it does not exist.

    :return: a string representing the path
    """
    quapy_home = str(Path.home() / 'quapy_data')
    os.makedirs(quapy_home, exist_ok=True)
    return quapy_home
|
||||
|
||||
|
||||
def create_parent_dir(path):
    """
    Creates the parent dir of a given path, if not exists. E.g., for `./path/to/file.txt`, the path `./path/to`
    is created.

    :param path: the path
    """
    # the parent of a Path is never empty (it is "." for a bare file name) and a Path object is always truthy,
    # so no guard is needed: creating an already existing directory is a no-op with exist_ok=True
    os.makedirs(Path(path).parent, exist_ok=True)
|
||||
|
||||
|
||||
def save_text_file(path, text):
    """
    Saves a text file to disk, given its full path, and creates the parent directory if missing.

    :param path: path where to save the text.
    :param text: text to save.
    """
    create_parent_dir(path)
    # the file to open is the destination `path`; `text` is only the content to be written
    with open(path, 'wt') as fout:
        fout.write(text)
|
||||
|
@ -108,10 +153,12 @@ def pickled_resource(pickle_path:str, generation_func:callable, *args):
|
|||
"""
|
||||
Allows for fast reuse of resources that are generated only once by calling generation_func(*args). The next times
|
||||
this function is invoked, it loads the pickled resource. Example:
|
||||
def some_array(n):
|
||||
return np.random.rand(n)
|
||||
pickled_resource('./my_array.pkl', some_array, 10) # the resource does not exist: it is created by some_array(10)
|
||||
pickled_resource('./my_array.pkl', some_array, 10) # the resource exists: it is loaded from './my_array.pkl'
|
||||
|
||||
>>> def some_array(n): # a mock resource created with one parameter (`n`)
|
||||
>>> return np.random.rand(n)
|
||||
>>> pickled_resource('./my_array.pkl', some_array, 10) # the resource does not exist: it is created by calling some_array(10)
|
||||
>>> pickled_resource('./my_array.pkl', some_array, 10) # the resource exists; it is loaded from './my_array.pkl'
|
||||
|
||||
:param pickle_path: the path where to save (first time) and load (next times) the resource
|
||||
:param generation_func: the function that generates the resource, in case it does not exist in pickle_path
|
||||
:param args: any arg that generation_func uses for generating the resources
|
||||
|
@ -130,8 +177,36 @@ def pickled_resource(pickle_path:str, generation_func:callable, *args):
|
|||
|
||||
|
||||
class EarlyStop:
|
||||
"""
|
||||
A class implementing the early-stopping condition typically used for training neural networks.
|
||||
|
||||
:param patience: the number of (consecutive) times that a monitored evaluation metric (typically obtained in a
|
||||
held-out validation split) can be found to be worse than the best one obtained so far, before flagging the
|
||||
stopping condition. An instance of this class is `callable`, and is to be used as follows:
|
||||
|
||||
>>> earlystop = EarlyStop(patience=2, lower_is_better=True)
|
||||
>>> earlystop(0.9, epoch=0)
|
||||
>>> earlystop(0.7, epoch=1)
|
||||
>>> earlystop.IMPROVED # is True
|
||||
>>> earlystop(1.0, epoch=2)
|
||||
>>> earlystop.STOP # is False (patience=1)
|
||||
>>> earlystop(1.0, epoch=3)
|
||||
>>> earlystop.STOP # is True (patience=0)
|
||||
>>> earlystop.best_epoch # is 1
|
||||
>>> earlystop.best_score # is 0.7
|
||||
|
||||
|
||||
:param lower_is_better: if True (default) the metric is to be minimized.
|
||||
|
||||
:ivar best_score: keeps track of the best value seen so far
|
||||
:ivar best_epoch: keeps track of the epoch in which the best score was set
|
||||
:ivar STOP: flag (boolean) indicating the stopping condition
|
||||
:ivar IMPROVED: flag (boolean) indicating whether there was an improvement in the last call
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, patience, lower_is_better=True):
|
||||
|
||||
self.PATIENCE_LIMIT = patience
|
||||
self.better = lambda a,b: a<b if lower_is_better else a>b
|
||||
self.patience = patience
|
||||
|
@ -141,6 +216,14 @@ class EarlyStop:
|
|||
self.IMPROVED = False
|
||||
|
||||
def __call__(self, watch_score, epoch):
|
||||
"""
|
||||
Commits the new score found in epoch `epoch`. If the score improves over the best score found so far, then
|
||||
the patience counter gets reset. Otherwise, the patience counter is decreased, and in case it reaches 0,
|
||||
the flag STOP becomes True.
|
||||
|
||||
:param watch_score: the new score
|
||||
:param epoch: the current epoch
|
||||
"""
|
||||
self.IMPROVED = (self.best_score is None or self.better(watch_score, self.best_score))
|
||||
if self.IMPROVED:
|
||||
self.best_score = watch_score
|
||||
|
|
Loading…
Reference in New Issue