forked from moreo/QuaPy
testing baselines for lequa
This commit is contained in:
parent
1a3755eb58
commit
7468519495
|
@ -1,11 +1,6 @@
|
||||||
2. tablas?
|
2. tablas?
|
||||||
3. fetch dataset (download, unzip, etc.)
|
3. fetch dataset (download, unzip, etc.)
|
||||||
4. model selection
|
|
||||||
5. plots
|
5. plots
|
||||||
8. No me convence que la lectura de los samples (caso en que no hay ground truth) viene en orden aleatorio
|
|
||||||
9. Experimentar con vectores densos (PCA sobre tfidf por ejemplo)
|
|
||||||
10. Si cambiamos el formato de los samples (por ejemplo, en lugar de svmlight con .txt a PCA con .dat) hay que cambiar
|
|
||||||
cosas en el código. Está escrito varias veces un glob(*.txt)
|
|
||||||
11. Quitar las categorias como columnas de los ficheros de prevalences
|
11. Quitar las categorias como columnas de los ficheros de prevalences
|
||||||
12. sample_size cannot be set to a non-integer in GridSearchQ with protocol="gen" (it could, but is not indicated in doc)
|
12. sample_size cannot be set to a non-integer in GridSearchQ with protocol="gen" (it could, but is not indicated in doc)
|
||||||
13. repair doc of GridSearchQ
|
13. repair doc of GridSearchQ
|
||||||
|
|
|
@ -2,13 +2,14 @@ import argparse
|
||||||
import pickle
|
import pickle
|
||||||
from sklearn.linear_model import LogisticRegression as LR
|
from sklearn.linear_model import LogisticRegression as LR
|
||||||
from quapy.method.aggregative import *
|
from quapy.method.aggregative import *
|
||||||
|
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
|
||||||
import quapy.functional as F
|
import quapy.functional as F
|
||||||
from data import *
|
from data import *
|
||||||
import os
|
import os
|
||||||
import constants
|
import constants
|
||||||
|
|
||||||
|
|
||||||
# LeQua official baselines for task T1B (Multiclass/Vector)
|
# LeQua official baselines for task T1A (Binary/Vector) and T1B (Multiclass/Vector)
|
||||||
# =========================================================
|
# =========================================================
|
||||||
|
|
||||||
def baselines():
|
def baselines():
|
||||||
|
@ -17,7 +18,8 @@ def baselines():
|
||||||
yield PCC(LR(n_jobs=-1)), "PCC"
|
yield PCC(LR(n_jobs=-1)), "PCC"
|
||||||
yield PACC(LR(n_jobs=-1)), "PACC"
|
yield PACC(LR(n_jobs=-1)), "PACC"
|
||||||
yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
|
yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
|
||||||
yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
|
# yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
|
||||||
|
# yield MLPE(), "MLPE"
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
|
@ -30,7 +32,7 @@ def main(args):
|
||||||
|
|
||||||
qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
|
qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
|
||||||
|
|
||||||
train = LabelledCollection.load(path_train, load_binary_vectors)
|
train = LabelledCollection.load(path_train, load_vector_documents)
|
||||||
nF = train.instances.shape[1]
|
nF = train.instances.shape[1]
|
||||||
|
|
||||||
print(f'number of classes: {len(train.classes_)}')
|
print(f'number of classes: {len(train.classes_)}')
|
||||||
|
@ -38,13 +40,19 @@ def main(args):
|
||||||
print(f'training prevalence: {F.strprev(train.prevalence())}')
|
print(f'training prevalence: {F.strprev(train.prevalence())}')
|
||||||
print(f'training matrix shape: {train.instances.shape}')
|
print(f'training matrix shape: {train.instances.shape}')
|
||||||
|
|
||||||
|
# param_grid = {
|
||||||
|
# 'C': np.logspace(-3, 3, 7),
|
||||||
|
# 'class_weight': ['balanced', None]
|
||||||
|
# }
|
||||||
|
|
||||||
param_grid = {
|
param_grid = {
|
||||||
'C': np.logspace(-3,3,7),
|
'C': [1],
|
||||||
'class_weight': ['balanced', None]
|
'class_weight': ['balanced']
|
||||||
}
|
}
|
||||||
|
|
||||||
def gen_samples():
|
def gen_samples():
|
||||||
return gen_load_samples_T1(path_dev_vectors, nF, ground_truth_path=path_dev_prevs, return_id=False)
|
return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False,
|
||||||
|
load_fn=load_vector_documents, nF=nF)
|
||||||
|
|
||||||
for quantifier, q_name in baselines():
|
for quantifier, q_name in baselines():
|
||||||
print(f'{q_name}: Model selection')
|
print(f'{q_name}: Model selection')
|
||||||
|
|
|
@ -13,7 +13,7 @@ SAMPLE_SIZE={
|
||||||
'T1A': T1A_SAMPLE_SIZE,
|
'T1A': T1A_SAMPLE_SIZE,
|
||||||
'T1B': T1B_SAMPLE_SIZE,
|
'T1B': T1B_SAMPLE_SIZE,
|
||||||
'T2A': T2A_SAMPLE_SIZE,
|
'T2A': T2A_SAMPLE_SIZE,
|
||||||
'T2A': T2B_SAMPLE_SIZE
|
'T2B': T2B_SAMPLE_SIZE
|
||||||
}
|
}
|
||||||
|
|
||||||
ERROR_TOL = 1E-3
|
ERROR_TOL = 1E-3
|
||||||
|
|
|
@ -12,17 +12,6 @@ from glob import glob
|
||||||
import constants
|
import constants
|
||||||
|
|
||||||
|
|
||||||
# def load_binary_raw_document(path):
|
|
||||||
# documents, labels = qp.data.from_text(path, verbose=0, class2int=True)
|
|
||||||
# labels = np.asarray(labels)
|
|
||||||
# labels[np.logical_or(labels == 1, labels == 2)] = 0
|
|
||||||
# labels[np.logical_or(labels == 4, labels == 5)] = 1
|
|
||||||
# return documents, labels
|
|
||||||
|
|
||||||
|
|
||||||
# def load_multiclass_raw_document(path):
|
|
||||||
# return qp.data.from_text(path, verbose=0, class2int=False)
|
|
||||||
|
|
||||||
def load_category_map(path):
|
def load_category_map(path):
|
||||||
cat2code = {}
|
cat2code = {}
|
||||||
with open(path, 'rt') as fin:
|
with open(path, 'rt') as fin:
|
||||||
|
@ -33,7 +22,19 @@ def load_category_map(path):
|
||||||
return cat2code, code2cat
|
return cat2code, code2cat
|
||||||
|
|
||||||
|
|
||||||
def load_binary_vectors(path, nF=None):
|
def load_raw_documents(path):
|
||||||
|
return qp.data.from_text(path, verbose=0, class2int=True)
|
||||||
|
|
||||||
|
|
||||||
|
def load_raw_unlabelled_documents(path, vectorizer=None):
|
||||||
|
with open(path, 'rt', encoding='utf-8') as file:
|
||||||
|
documents = [d.strip() for d in file.readlines()]
|
||||||
|
if vectorizer:
|
||||||
|
documents = vectorizer.transform(documents)
|
||||||
|
return documents, None
|
||||||
|
|
||||||
|
|
||||||
|
def load_vector_documents(path, nF=None):
|
||||||
X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF)
|
X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF)
|
||||||
y = y.astype(int)
|
y = y.astype(int)
|
||||||
return X, y
|
return X, y
|
||||||
|
@ -53,13 +54,13 @@ def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn,
|
||||||
yield (id, sample) if return_id else sample
|
yield (id, sample) if return_id else sample
|
||||||
|
|
||||||
|
|
||||||
def gen_load_samples_T1(path_dir:str, nF:int, ground_truth_path:str = None, return_id=True):
|
def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=True, load_fn=load_vector_documents, **load_kwargs):
|
||||||
if ground_truth_path is None:
|
if ground_truth_path is None:
|
||||||
# the generator function returns tuples (filename:str, sample:csr_matrix)
|
# the generator function returns tuples (docid:str, sample:csr_matrix or str)
|
||||||
gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_binary_vectors, nF=nF)
|
gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, **load_kwargs)
|
||||||
else:
|
else:
|
||||||
# the generator function returns tuples (filename:str, sample:csr_matrix, prevalence:ndarray)
|
# the generator function returns tuples (docid:str, sample:csr_matrix or str, prevalence:ndarray)
|
||||||
gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_binary_vectors, nF=nF)
|
gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, **load_kwargs)
|
||||||
for r in gen_fn:
|
for r in gen_fn:
|
||||||
yield r
|
yield r
|
||||||
|
|
||||||
|
@ -75,16 +76,6 @@ def genSVD_load_samples_T1(load_fn, path_dir:str, nF:int, ground_truth_path:str
|
||||||
yield r
|
yield r
|
||||||
|
|
||||||
|
|
||||||
def gen_load_samples_T2A(path_dir:str, ground_truth_path:str = None):
|
|
||||||
# for ... : yield
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def gen_load_samples_T2B(path_dir:str, ground_truth_path:str = None):
|
|
||||||
# for ... : yield
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class ResultSubmission:
|
class ResultSubmission:
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
|
@ -5,7 +5,7 @@ import constants
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from data import gen_load_samples_T1
|
from data import gen_load_samples
|
||||||
from glob import glob
|
from glob import glob
|
||||||
import constants
|
import constants
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ def main(args):
|
||||||
|
|
||||||
# predictions
|
# predictions
|
||||||
predictions = ResultSubmission()
|
predictions = ResultSubmission()
|
||||||
for sampleid, sample in tqdm(gen_load_samples_T1(args.samples, args.nf), desc='predicting', total=nsamples):
|
for sampleid, sample in tqdm(gen_load_samples(args.samples, args.nf), desc='predicting', total=nsamples):
|
||||||
predictions.add(sampleid, model.quantify(sample))
|
predictions.add(sampleid, model.quantify(sample))
|
||||||
|
|
||||||
# saving
|
# saving
|
||||||
|
|
|
@ -941,8 +941,6 @@
|
||||||
<li><a href="quapy.data.html#quapy.data.base.LabelledCollection.sampling_from_index">sampling_from_index() (quapy.data.base.LabelledCollection method)</a>
|
<li><a href="quapy.data.html#quapy.data.base.LabelledCollection.sampling_from_index">sampling_from_index() (quapy.data.base.LabelledCollection method)</a>
|
||||||
</li>
|
</li>
|
||||||
<li><a href="quapy.data.html#quapy.data.base.LabelledCollection.sampling_index">sampling_index() (quapy.data.base.LabelledCollection method)</a>
|
<li><a href="quapy.data.html#quapy.data.base.LabelledCollection.sampling_index">sampling_index() (quapy.data.base.LabelledCollection method)</a>
|
||||||
</li>
|
|
||||||
<li><a href="quapy.html#quapy.plot.save_or_show">save_or_show() (in module quapy.plot)</a>
|
|
||||||
</li>
|
</li>
|
||||||
<li><a href="quapy.html#quapy.util.save_text_file">save_text_file() (in module quapy.util)</a>
|
<li><a href="quapy.html#quapy.util.save_text_file">save_text_file() (in module quapy.util)</a>
|
||||||
</li>
|
</li>
|
||||||
|
|
Binary file not shown.
|
@ -721,12 +721,21 @@ being ignored, a TimeoutError exception is raised. If -1 (default) then no time
|
||||||
<dl class="py method">
|
<dl class="py method">
|
||||||
<dt class="sig sig-object py" id="quapy.model_selection.GridSearchQ.best_model">
|
<dt class="sig sig-object py" id="quapy.model_selection.GridSearchQ.best_model">
|
||||||
<span class="sig-name descname"><span class="pre">best_model</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.model_selection.GridSearchQ.best_model" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-name descname"><span class="pre">best_model</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.model_selection.GridSearchQ.best_model" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd></dd></dl>
|
<dd><p>Returns the best model found after calling the <a class="reference internal" href="#quapy.model_selection.GridSearchQ.fit" title="quapy.model_selection.GridSearchQ.fit"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fit()</span></code></a> method, i.e., the one trained on the combination
|
||||||
|
of hyper-parameters that minimized the error function.</p>
|
||||||
|
<dl class="field-list simple">
|
||||||
|
<dt class="field-odd">Returns</dt>
|
||||||
|
<dd class="field-odd"><p>a trained quantifier</p>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py property">
|
<dl class="py property">
|
||||||
<dt class="sig sig-object py" id="quapy.model_selection.GridSearchQ.classes_">
|
<dt class="sig sig-object py" id="quapy.model_selection.GridSearchQ.classes_">
|
||||||
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">classes_</span></span><a class="headerlink" href="#quapy.model_selection.GridSearchQ.classes_" title="Permalink to this definition">¶</a></dt>
|
<em class="property"><span class="pre">property</span> </em><span class="sig-name descname"><span class="pre">classes_</span></span><a class="headerlink" href="#quapy.model_selection.GridSearchQ.classes_" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd></dd></dl>
|
<dd><p>Classes on which the quantifier has been trained.
|
||||||
|
:return: a ndarray of shape <cite>(n_classes)</cite> with the class identifiers</p>
|
||||||
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py method">
|
<dl class="py method">
|
||||||
<dt class="sig sig-object py" id="quapy.model_selection.GridSearchQ.fit">
|
<dt class="sig sig-object py" id="quapy.model_selection.GridSearchQ.fit">
|
||||||
|
@ -743,6 +752,9 @@ being ignored, a TimeoutError exception is raised. If -1 (default) then no time
|
||||||
a float in [0,1] indicating the proportion of labelled data to extract from the training set</p></li>
|
a float in [0,1] indicating the proportion of labelled data to extract from the training set</p></li>
|
||||||
</ul>
|
</ul>
|
||||||
</dd>
|
</dd>
|
||||||
|
<dt class="field-even">Returns</dt>
|
||||||
|
<dd class="field-even"><p>self</p>
|
||||||
|
</dd>
|
||||||
</dl>
|
</dl>
|
||||||
</dd></dl>
|
</dd></dl>
|
||||||
|
|
||||||
|
@ -763,11 +775,15 @@ a float in [0,1] indicating the proportion of labelled data to extract from the
|
||||||
<dl class="py method">
|
<dl class="py method">
|
||||||
<dt class="sig sig-object py" id="quapy.model_selection.GridSearchQ.quantify">
|
<dt class="sig sig-object py" id="quapy.model_selection.GridSearchQ.quantify">
|
||||||
<span class="sig-name descname"><span class="pre">quantify</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">instances</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.model_selection.GridSearchQ.quantify" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-name descname"><span class="pre">quantify</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">instances</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.model_selection.GridSearchQ.quantify" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd><p>Estimate class prevalence values</p>
|
<dd><p>Estimate class prevalence values using the best model found after calling the <a class="reference internal" href="#quapy.model_selection.GridSearchQ.fit" title="quapy.model_selection.GridSearchQ.fit"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fit()</span></code></a> method.</p>
|
||||||
<dl class="field-list simple">
|
<dl class="field-list simple">
|
||||||
<dt class="field-odd">Parameters</dt>
|
<dt class="field-odd">Parameters</dt>
|
||||||
<dd class="field-odd"><p><strong>instances</strong> – sample containing the instances</p>
|
<dd class="field-odd"><p><strong>instances</strong> – sample containing the instances</p>
|
||||||
</dd>
|
</dd>
|
||||||
|
<dt class="field-even">Returns</dt>
|
||||||
|
<dd class="field-even"><p>a ndarray of shape <cite>(n_classes)</cite> with class prevalence estimates as according to the best model found
|
||||||
|
by the model selection process.</p>
|
||||||
|
</dd>
|
||||||
</dl>
|
</dl>
|
||||||
</dd></dl>
|
</dd></dl>
|
||||||
|
|
||||||
|
@ -790,7 +806,9 @@ a float in [0,1] indicating the proportion of labelled data to extract from the
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.plot.binary_bias_bins">
|
<dt class="sig sig-object py" id="quapy.plot.binary_bias_bins">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.plot.</span></span><span class="sig-name descname"><span class="pre">binary_bias_bins</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="pre">method_names</span></em>, <em class="sig-param"><span class="pre">true_prevs</span></em>, <em class="sig-param"><span class="pre">estim_prevs</span></em>, <em class="sig-param"><span class="pre">pos_class=1</span></em>, <em class="sig-param"><span class="pre">title=None</span></em>, <em class="sig-param"><span class="pre">nbins=5</span></em>, <em class="sig-param"><span class="pre">colormap=<matplotlib.colors.ListedColormap</span> <span class="pre">object></span></em>, <em class="sig-param"><span class="pre">vertical_xticks=False</span></em>, <em class="sig-param"><span class="pre">legend=True</span></em>, <em class="sig-param"><span class="pre">savepath=None</span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.plot.binary_bias_bins" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.plot.</span></span><span class="sig-name descname"><span class="pre">binary_bias_bins</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="pre">method_names</span></em>, <em class="sig-param"><span class="pre">true_prevs</span></em>, <em class="sig-param"><span class="pre">estim_prevs</span></em>, <em class="sig-param"><span class="pre">pos_class=1</span></em>, <em class="sig-param"><span class="pre">title=None</span></em>, <em class="sig-param"><span class="pre">nbins=5</span></em>, <em class="sig-param"><span class="pre">colormap=<matplotlib.colors.ListedColormap</span> <span class="pre">object></span></em>, <em class="sig-param"><span class="pre">vertical_xticks=False</span></em>, <em class="sig-param"><span class="pre">legend=True</span></em>, <em class="sig-param"><span class="pre">savepath=None</span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.plot.binary_bias_bins" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd><dl class="field-list simple">
|
<dd><p>Box-plots displaying the local bias (i.e., signed error computed as the estimated value minus the true value)
|
||||||
|
for different bins of (true) prevalence of the positive class, for each quantification method.</p>
|
||||||
|
<dl class="field-list simple">
|
||||||
<dt class="field-odd">Parameters</dt>
|
<dt class="field-odd">Parameters</dt>
|
||||||
<dd class="field-odd"><ul class="simple">
|
<dd class="field-odd"><ul class="simple">
|
||||||
<li><p><strong>method_names</strong> – array-like with the method names for each experiment</p></li>
|
<li><p><strong>method_names</strong> – array-like with the method names for each experiment</p></li>
|
||||||
|
@ -802,7 +820,7 @@ for each experiment</p></li>
|
||||||
<li><p><strong>title</strong> – the title to be displayed in the plot</p></li>
|
<li><p><strong>title</strong> – the title to be displayed in the plot</p></li>
|
||||||
<li><p><strong>nbins</strong> – number of bins</p></li>
|
<li><p><strong>nbins</strong> – number of bins</p></li>
|
||||||
<li><p><strong>colormap</strong> – the matplotlib colormap to use (default cm.tab10)</p></li>
|
<li><p><strong>colormap</strong> – the matplotlib colormap to use (default cm.tab10)</p></li>
|
||||||
<li><p><strong>vertical_xticks</strong> – </p></li>
|
<li><p><strong>vertical_xticks</strong> – whether or not to add secondary grid (default is False)</p></li>
|
||||||
<li><p><strong>legend</strong> – whether or not to display the legend (default is True)</p></li>
|
<li><p><strong>legend</strong> – whether or not to display the legend (default is True)</p></li>
|
||||||
<li><p><strong>savepath</strong> – path where to save the plot. If not indicated (as default), the plot is shown.</p></li>
|
<li><p><strong>savepath</strong> – path where to save the plot. If not indicated (as default), the plot is shown.</p></li>
|
||||||
</ul>
|
</ul>
|
||||||
|
@ -865,17 +883,77 @@ listed in the legend and associated with matplotlib colors).</p></li>
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.plot.brokenbar_supremacy_by_drift">
|
<dt class="sig sig-object py" id="quapy.plot.brokenbar_supremacy_by_drift">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.plot.</span></span><span class="sig-name descname"><span class="pre">brokenbar_supremacy_by_drift</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">method_names</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">true_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">estim_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tr_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_bins</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">binning</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'isomerous'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">x_error</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'ae'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">y_error</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'ae'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ttest_alpha</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.005</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tail_density_threshold</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.005</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">method_order</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span 
class="n"><span class="pre">savepath</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.plot.brokenbar_supremacy_by_drift" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.plot.</span></span><span class="sig-name descname"><span class="pre">brokenbar_supremacy_by_drift</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">method_names</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">true_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">estim_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tr_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_bins</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">binning</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'isomerous'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">x_error</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'ae'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">y_error</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'ae'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ttest_alpha</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.005</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tail_density_threshold</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.005</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">method_order</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span 
class="n"><span class="pre">savepath</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.plot.brokenbar_supremacy_by_drift" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd></dd></dl>
|
<dd><p>Displays (only) the top performing methods for different regions of the train-test shift in form of a broken
|
||||||
|
bar chart, in which each method has bars only for those regions in which either one of the following conditions
|
||||||
|
hold: (i) it is the best method (in average) for the bin, or (ii) it is not statistically significantly different
|
||||||
|
(in average) as according to a two-sided t-test on independent samples at confidence <cite>ttest_alpha</cite>.
|
||||||
|
The binning can be made “isometric” (same size), or “isomerous” (same number of experiments – default). A second
|
||||||
|
plot is displayed on top, that displays the distribution of experiments for each bin (when binning=”isometric”) or
|
||||||
|
the percentiles points of the distribution (when binning=”isomerous”).</p>
|
||||||
|
<dl class="field-list simple">
|
||||||
|
<dt class="field-odd">Parameters</dt>
|
||||||
|
<dd class="field-odd"><ul class="simple">
|
||||||
|
<li><p><strong>method_names</strong> – array-like with the method names for each experiment</p></li>
|
||||||
|
<li><p><strong>true_prevs</strong> – array-like with the true prevalence values (each being a ndarray with n_classes components) for
|
||||||
|
each experiment</p></li>
|
||||||
|
<li><p><strong>estim_prevs</strong> – array-like with the estimated prevalence values (each being a ndarray with n_classes components)
|
||||||
|
for each experiment</p></li>
|
||||||
|
<li><p><strong>tr_prevs</strong> – training prevalence of each experiment</p></li>
|
||||||
|
<li><p><strong>n_bins</strong> – number of bins in which the y-axis is to be divided (default is 20)</p></li>
|
||||||
|
<li><p><strong>binning</strong> – type of binning, either “isomerous” (default) or “isometric”</p></li>
|
||||||
|
<li><p><strong>x_error</strong> – a string representing the name of an error function (as defined in <cite>quapy.error</cite>) to be used for
|
||||||
|
measuring the amount of train-test shift (default is “ae”)</p></li>
|
||||||
|
<li><p><strong>y_error</strong> – a string representing the name of an error function (as defined in <cite>quapy.error</cite>) to be used for
|
||||||
|
measuring the amount of error in the prevalence estimations (default is “ae”)</p></li>
|
||||||
|
<li><p><strong>ttest_alpha</strong> – the confidence interval above which a p-value (two-sided t-test on independent samples) is
|
||||||
|
to be considered as an indicator that the two means are not statistically significantly different. Default is
|
||||||
|
0.005, meaning that a <cite>p-value > 0.005</cite> indicates the two methods involved are to be considered similar</p></li>
|
||||||
|
<li><p><strong>tail_density_threshold</strong> – sets a threshold on the density of experiments (over the total number of experiments)
|
||||||
|
below which a bin in the tail (i.e., the right-most ones) will be discarded. This is in order to avoid some
|
||||||
|
bins to be shown for train-test outliers.</p></li>
|
||||||
|
<li><p><strong>method_order</strong> – if indicated (default is None), imposes the order in which the methods are processed (i.e.,
|
||||||
|
listed in the legend and associated with matplotlib colors).</p></li>
|
||||||
|
<li><p><strong>savepath</strong> – path where to save the plot. If not indicated (as default), the plot is shown.</p></li>
|
||||||
|
</ul>
|
||||||
|
</dd>
|
||||||
|
<dt class="field-even">Returns</dt>
|
||||||
|
<dd class="field-even"><p></p>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.plot.error_by_drift">
|
<dt class="sig sig-object py" id="quapy.plot.error_by_drift">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.plot.</span></span><span class="sig-name descname"><span class="pre">error_by_drift</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">method_names</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">true_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">estim_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tr_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_bins</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">error_name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'ae'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">show_std</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">show_density</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">logscale</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">title</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'Quantification</span> <span class="pre">error</span> <span class="pre">as</span> <span class="pre">a</span> <span class="pre">function</span> <span class="pre">of</span> <span class="pre">distribution</span> <span class="pre">shift'</span></span></em>, <em class="sig-param"><span 
class="n"><span class="pre">savepath</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">vlines</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">method_order</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.plot.error_by_drift" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.plot.</span></span><span class="sig-name descname"><span class="pre">error_by_drift</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">method_names</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">true_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">estim_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tr_prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_bins</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">error_name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'ae'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">show_std</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">show_density</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">logscale</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">title</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'Quantification</span> <span class="pre">error</span> <span class="pre">as</span> <span class="pre">a</span> <span class="pre">function</span> <span class="pre">of</span> <span class="pre">distribution</span> <span class="pre">shift'</span></span></em>, <em class="sig-param"><span 
class="n"><span class="pre">vlines</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">method_order</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">savepath</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.plot.error_by_drift" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd></dd></dl>
|
<dd><p>Plots the error (along the x-axis, as measured in terms of <cite>error_name</cite>) as a function of the train-test shift
|
||||||
|
(along the y-axis, as measured in terms of <a class="reference internal" href="#quapy.error.ae" title="quapy.error.ae"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.error.ae()</span></code></a>). This plot is useful especially for multiclass
|
||||||
<dl class="py function">
|
problems, in which “diagonal plots” may be cumbersome, and in order to gain understanding about how methods
|
||||||
<dt class="sig sig-object py" id="quapy.plot.save_or_show">
|
fare in different regions of the prior probability shift spectrum (e.g., in the low-shift regime vs. in the
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.plot.</span></span><span class="sig-name descname"><span class="pre">save_or_show</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">savepath</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.plot.save_or_show" title="Permalink to this definition">¶</a></dt>
|
high-shift regime).</p>
|
||||||
<dd></dd></dl>
|
<dl class="field-list simple">
|
||||||
|
<dt class="field-odd">Parameters</dt>
|
||||||
|
<dd class="field-odd"><ul class="simple">
|
||||||
|
<li><p><strong>method_names</strong> – array-like with the method names for each experiment</p></li>
|
||||||
|
<li><p><strong>true_prevs</strong> – array-like with the true prevalence values (each being a ndarray with n_classes components) for
|
||||||
|
each experiment</p></li>
|
||||||
|
<li><p><strong>estim_prevs</strong> – array-like with the estimated prevalence values (each being a ndarray with n_classes components)
|
||||||
|
for each experiment</p></li>
|
||||||
|
<li><p><strong>tr_prevs</strong> – training prevalence of each experiment</p></li>
|
||||||
|
<li><p><strong>n_bins</strong> – number of bins in which the y-axis is to be divided (default is 20)</p></li>
|
||||||
|
<li><p><strong>error_name</strong> – a string representing the name of an error function (as defined in <cite>quapy.error</cite>, default is “ae”)</p></li>
|
||||||
|
<li><p><strong>show_std</strong> – whether or not to show standard deviations as color bands (default is False)</p></li>
|
||||||
|
<li><p><strong>show_density</strong> – whether or not to display the distribution of experiments for each bin (default is True)</p></li>
|
||||||
|
<li><p><strong>logscale</strong> – whether or not to log-scale the y-error measure (default is False)</p></li>
|
||||||
|
<li><p><strong>title</strong> – title of the plot (default is “Quantification error as a function of distribution shift”)</p></li>
|
||||||
|
<li><p><strong>vlines</strong> – array-like list of values (default is None). If indicated, highlights some regions of the space
|
||||||
|
using vertical dotted lines.</p></li>
|
||||||
|
<li><p><strong>method_order</strong> – if indicated (default is None), imposes the order in which the methods are processed (i.e.,
|
||||||
|
listed in the legend and associated with matplotlib colors).</p></li>
|
||||||
|
<li><p><strong>savepath</strong> – path where to save the plot. If not indicated (as default), the plot is shown.</p></li>
|
||||||
|
</ul>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
|
</dd></dl>
|
||||||
|
|
||||||
</section>
|
</section>
|
||||||
<section id="module-quapy.util">
|
<section id="module-quapy.util">
|
||||||
|
@ -884,82 +962,184 @@ listed in the legend and associated with matplotlib colors).</p></li>
|
||||||
<dt class="sig sig-object py" id="quapy.util.EarlyStop">
|
<dt class="sig sig-object py" id="quapy.util.EarlyStop">
|
||||||
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">EarlyStop</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">patience</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lower_is_better</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.EarlyStop" title="Permalink to this definition">¶</a></dt>
|
<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">EarlyStop</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">patience</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lower_is_better</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.EarlyStop" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
|
||||||
|
<p>A class implementing the early-stopping condition typically used for training neural networks.</p>
|
||||||
|
<dl class="field-list simple">
|
||||||
|
<dt class="field-odd">Parameters</dt>
|
||||||
|
<dd class="field-odd"><p><strong>patience</strong> – the number of (consecutive) times that a monitored evaluation metric (typically obtained in a</p>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
|
<p>held-out validation split) can be found to be worse than the best one obtained so far, before flagging the
|
||||||
|
stopping condition. An instance of this class is <cite>callable</cite>, and is to be used as follows:</p>
|
||||||
|
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">earlystop</span> <span class="o">=</span> <span class="n">EarlyStop</span><span class="p">(</span><span class="n">patience</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">lower_is_better</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
|
||||||
|
<span class="gp">>>> </span><span class="n">earlystop</span><span class="p">(</span><span class="mf">0.9</span><span class="p">,</span> <span class="n">epoch</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
|
||||||
|
<span class="gp">>>> </span><span class="n">earlystop</span><span class="p">(</span><span class="mf">0.7</span><span class="p">,</span> <span class="n">epoch</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
|
||||||
|
<span class="gp">>>> </span><span class="n">earlystop</span><span class="o">.</span><span class="n">IMPROVED</span> <span class="c1"># is True</span>
|
||||||
|
<span class="gp">>>> </span><span class="n">earlystop</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">epoch</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
|
||||||
|
<span class="gp">>>> </span><span class="n">earlystop</span><span class="o">.</span><span class="n">STOP</span> <span class="c1"># is False (patience=1)</span>
|
||||||
|
<span class="gp">>>> </span><span class="n">earlystop</span><span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="n">epoch</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
|
||||||
|
<span class="gp">>>> </span><span class="n">earlystop</span><span class="o">.</span><span class="n">STOP</span> <span class="c1"># is True (patience=0)</span>
|
||||||
|
<span class="gp">>>> </span><span class="n">earlystop</span><span class="o">.</span><span class="n">best_epoch</span> <span class="c1"># is 1</span>
|
||||||
|
<span class="gp">>>> </span><span class="n">earlystop</span><span class="o">.</span><span class="n">best_score</span> <span class="c1"># is 0.7</span>
|
||||||
|
</pre></div>
|
||||||
|
</div>
|
||||||
|
<dl class="field-list simple">
|
||||||
|
<dt class="field-odd">Parameters</dt>
|
||||||
|
<dd class="field-odd"><p><strong>lower_is_better</strong> – if True (default) the metric is to be minimized.</p>
|
||||||
|
</dd>
|
||||||
|
<dt class="field-even">Variables</dt>
|
||||||
|
<dd class="field-even"><ul class="simple">
|
||||||
|
<li><p><strong>best_score</strong> – keeps track of the best value seen so far</p></li>
|
||||||
|
<li><p><strong>best_epoch</strong> – keeps track of the epoch in which the best score was set</p></li>
|
||||||
|
<li><p><strong>STOP</strong> – flag (boolean) indicating the stopping condition</p></li>
|
||||||
|
<li><p><strong>IMPROVED</strong> – flag (boolean) indicating whether there was an improvement in the last call</p></li>
|
||||||
|
</ul>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
</dd></dl>
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.util.create_if_not_exist">
|
<dt class="sig sig-object py" id="quapy.util.create_if_not_exist">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">create_if_not_exist</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.create_if_not_exist" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">create_if_not_exist</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.create_if_not_exist" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd></dd></dl>
|
<dd><p>An alias to <cite>os.makedirs(path, exist_ok=True)</cite> that also returns the path. This is useful in cases like, e.g.:</p>
|
||||||
|
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">path</span> <span class="o">=</span> <span class="n">create_if_not_exist</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="nb">dir</span><span class="p">,</span> <span class="n">subdir</span><span class="p">,</span> <span class="n">anotherdir</span><span class="p">))</span>
|
||||||
|
</pre></div>
|
||||||
|
</div>
|
||||||
|
<dl class="field-list simple">
|
||||||
|
<dt class="field-odd">Parameters</dt>
|
||||||
|
<dd class="field-odd"><p><strong>path</strong> – path to create</p>
|
||||||
|
</dd>
|
||||||
|
<dt class="field-even">Returns</dt>
|
||||||
|
<dd class="field-even"><p>the path itself</p>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.util.create_parent_dir">
|
<dt class="sig sig-object py" id="quapy.util.create_parent_dir">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">create_parent_dir</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.create_parent_dir" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">create_parent_dir</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.create_parent_dir" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd></dd></dl>
|
<dd><p>Creates the parent dir (if any) of a given path, if it does not exist. E.g., for <cite>./path/to/file.txt</cite>, the path <cite>./path/to</cite>
|
||||||
|
is created.</p>
|
||||||
|
<dl class="field-list simple">
|
||||||
|
<dt class="field-odd">Parameters</dt>
|
||||||
|
<dd class="field-odd"><p><strong>path</strong> – the path</p>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.util.download_file">
|
<dt class="sig sig-object py" id="quapy.util.download_file">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">download_file</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">archive_filename</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.download_file" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">download_file</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">archive_filename</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.download_file" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd></dd></dl>
|
<dd><p>Downloads a file from a url</p>
|
||||||
|
<dl class="field-list simple">
|
||||||
|
<dt class="field-odd">Parameters</dt>
|
||||||
|
<dd class="field-odd"><ul class="simple">
|
||||||
|
<li><p><strong>url</strong> – the url</p></li>
|
||||||
|
<li><p><strong>archive_filename</strong> – destination filename</p></li>
|
||||||
|
</ul>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.util.download_file_if_not_exists">
|
<dt class="sig sig-object py" id="quapy.util.download_file_if_not_exists">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">download_file_if_not_exists</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">archive_path</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.download_file_if_not_exists" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">download_file_if_not_exists</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">archive_filename</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.download_file_if_not_exists" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd></dd></dl>
|
<dd><p>Downloads a file (using <a class="reference internal" href="#quapy.util.download_file" title="quapy.util.download_file"><code class="xref py py-meth docutils literal notranslate"><span class="pre">download_file()</span></code></a>) if the file does not exist.</p>
|
||||||
|
<dl class="field-list simple">
|
||||||
|
<dt class="field-odd">Parameters</dt>
|
||||||
|
<dd class="field-odd"><ul class="simple">
|
||||||
|
<li><p><strong>url</strong> – the url</p></li>
|
||||||
|
<li><p><strong>archive_filename</strong> – destination filename</p></li>
|
||||||
|
</ul>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.util.get_quapy_home">
|
<dt class="sig sig-object py" id="quapy.util.get_quapy_home">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">get_quapy_home</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.get_quapy_home" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">get_quapy_home</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.get_quapy_home" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd></dd></dl>
|
<dd><p>Gets the home directory of QuaPy, i.e., the directory where QuaPy saves permanent data, such as downloaded datasets.</p>
|
||||||
|
<dl class="field-list simple">
|
||||||
|
<dt class="field-odd">Returns</dt>
|
||||||
|
<dd class="field-odd"><p>a string representing the path</p>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.util.map_parallel">
|
<dt class="sig sig-object py" id="quapy.util.map_parallel">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">map_parallel</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">func</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.map_parallel" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">map_parallel</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">func</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.map_parallel" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd><p>Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then
|
<dd><p>Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then
|
||||||
func is applied in two parallel processes to args[0:50] and to args[50:99]</p>
|
func is applied in two parallel processes to args[0:50] and to args[50:99]</p>
|
||||||
|
<dl class="field-list simple">
|
||||||
|
<dt class="field-odd">Parameters</dt>
|
||||||
|
<dd class="field-odd"><ul class="simple">
|
||||||
|
<li><p><strong>func</strong> – function to be parallelized</p></li>
|
||||||
|
<li><p><strong>args</strong> – array-like of arguments to be passed to the function in different parallel calls</p></li>
|
||||||
|
<li><p><strong>n_jobs</strong> – the number of workers</p></li>
|
||||||
|
</ul>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
</dd></dl>
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.util.parallel">
|
<dt class="sig sig-object py" id="quapy.util.parallel">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">parallel</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">func</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.parallel" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">parallel</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">func</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.parallel" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd><p>A wrapper of multiprocessing:
|
<dd><p>A wrapper of multiprocessing:</p>
|
||||||
Parallel(n_jobs=n_jobs)(</p>
|
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">Parallel</span><span class="p">(</span><span class="n">n_jobs</span><span class="o">=</span><span class="n">n_jobs</span><span class="p">)(</span>
|
||||||
<blockquote>
|
<span class="gp">>>> </span> <span class="n">delayed</span><span class="p">(</span><span class="n">func</span><span class="p">)(</span><span class="n">args_i</span><span class="p">)</span> <span class="k">for</span> <span class="n">args_i</span> <span class="ow">in</span> <span class="n">args</span>
|
||||||
<div><p>delayed(func)(args_i) for args_i in args</p>
|
<span class="gp">>>> </span><span class="p">)</span>
|
||||||
</div></blockquote>
|
</pre></div>
|
||||||
<p>)
|
</div>
|
||||||
that takes the quapy.environ variable as input silently</p>
|
<p>that takes the <cite>quapy.environ</cite> variable as input silently</p>
|
||||||
</dd></dl>
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.util.pickled_resource">
|
<dt class="sig sig-object py" id="quapy.util.pickled_resource">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">pickled_resource</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">pickle_path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">generation_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.pickled_resource" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">pickled_resource</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">pickle_path</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">generation_func</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.pickled_resource" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd><p>Allows for fast reuse of resources that are generated only once by calling generation_func(<a href="#id1"><span class="problematic" id="id2">*</span></a>args). The next times
|
<dd><p>Allows for fast reuse of resources that are generated only once by calling generation_func(<a href="#id1"><span class="problematic" id="id2">*</span></a>args). The next times
|
||||||
this function is invoked, it loads the pickled resource. Example:
|
this function is invoked, it loads the pickled resource. Example:</p>
|
||||||
def some_array(n):</p>
|
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="k">def</span> <span class="nf">some_array</span><span class="p">(</span><span class="n">n</span><span class="p">):</span> <span class="c1"># a mock resource created with one parameter (`n`)</span>
|
||||||
<blockquote>
|
<span class="gp">>>> </span> <span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="n">n</span><span class="p">)</span>
|
||||||
<div><p>return np.random.rand(n)</p>
|
<span class="gp">>>> </span><span class="n">pickled_resource</span><span class="p">(</span><span class="s1">'./my_array.pkl'</span><span class="p">,</span> <span class="n">some_array</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> <span class="c1"># the resource does not exist: it is created by calling some_array(10)</span>
|
||||||
</div></blockquote>
|
<span class="gp">>>> </span><span class="n">pickled_resource</span><span class="p">(</span><span class="s1">'./my_array.pkl'</span><span class="p">,</span> <span class="n">some_array</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span> <span class="c1"># the resource exists; it is loaded from './my_array.pkl'</span>
|
||||||
<p>pickled_resource(‘./my_array.pkl’, some_array, 10) # the resource does not exist: it is created by some_array(10)
|
</pre></div>
|
||||||
pickled_resource(‘./my_array.pkl’, some_array, 10) # the resource exists: it is loaded from ‘./my_array.pkl’
|
</div>
|
||||||
:param pickle_path: the path where to save (first time) and load (next times) the resource
|
<dl class="field-list simple">
|
||||||
:param generation_func: the function that generates the resource, in case it does not exist in pickle_path
|
<dt class="field-odd">Parameters</dt>
|
||||||
:param args: any arg that generation_func uses for generating the resources
|
<dd class="field-odd"><ul class="simple">
|
||||||
:return: the resource</p>
|
<li><p><strong>pickle_path</strong> – the path where to save (first time) and load (next times) the resource</p></li>
|
||||||
|
<li><p><strong>generation_func</strong> – the function that generates the resource, in case it does not exist in pickle_path</p></li>
|
||||||
|
<li><p><strong>args</strong> – any arg that generation_func uses for generating the resources</p></li>
|
||||||
|
</ul>
|
||||||
|
</dd>
|
||||||
|
<dt class="field-even">Returns</dt>
|
||||||
|
<dd class="field-even"><p>the resource</p>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
</dd></dl>
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.util.save_text_file">
|
<dt class="sig sig-object py" id="quapy.util.save_text_file">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">save_text_file</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">text</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.save_text_file" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">save_text_file</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">text</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.save_text_file" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd></dd></dl>
|
<dd><p>Saves a text file to disk, given its full path, and creates the parent directory if missing.</p>
|
||||||
|
<dl class="field-list simple">
|
||||||
|
<dt class="field-odd">Parameters</dt>
|
||||||
|
<dd class="field-odd"><ul class="simple">
|
||||||
|
<li><p><strong>path</strong> – path where to save the text file.</p></li>
|
||||||
|
<li><p><strong>text</strong> – text to save.</p></li>
|
||||||
|
</ul>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
|
</dd></dl>
|
||||||
|
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.util.temp_seed">
|
<dt class="sig sig-object py" id="quapy.util.temp_seed">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">temp_seed</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">seed</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.temp_seed" title="Permalink to this definition">¶</a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.util.</span></span><span class="sig-name descname"><span class="pre">temp_seed</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">seed</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#quapy.util.temp_seed" title="Permalink to this definition">¶</a></dt>
|
||||||
<dd><p>Can be used in a “with” context to set a temporal seed without modifying the outer numpy’s current state. E.g.:
|
<dd><p>Can be used in a “with” context to set a temporary seed without modifying the outer numpy’s current state. E.g.:</p>
|
||||||
with temp_seed(random_seed):</p>
|
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="k">with</span> <span class="n">temp_seed</span><span class="p">(</span><span class="n">random_seed</span><span class="p">):</span>
|
||||||
<blockquote>
|
<span class="gp">>>> </span> <span class="k">pass</span> <span class="c1"># do any computation depending on np.random functionality</span>
|
||||||
<div><p># do any computation depending on np.random functionality</p>
|
</pre></div>
|
||||||
</div></blockquote>
|
</div>
|
||||||
<dl class="field-list simple">
|
<dl class="field-list simple">
|
||||||
<dt class="field-odd">Parameters</dt>
|
<dt class="field-odd">Parameters</dt>
|
||||||
<dd class="field-odd"><p><strong>seed</strong> – the seed to set within the “with” context</p>
|
<dd class="field-odd"><p><strong>seed</strong> – the seed to set within the “with” context</p>
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -18,13 +18,16 @@ def from_text(path, encoding='utf-8', verbose=1, class2int=True):
|
||||||
for line in file:
|
for line in file:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line:
|
if line:
|
||||||
label, sentence = line.split('\t')
|
try:
|
||||||
sentence = sentence.strip()
|
label, sentence = line.split('\t')
|
||||||
if class2int:
|
sentence = sentence.strip()
|
||||||
label = int(label)
|
if class2int:
|
||||||
if sentence:
|
label = int(label)
|
||||||
all_sentences.append(sentence)
|
if sentence:
|
||||||
all_labels.append(label)
|
all_sentences.append(sentence)
|
||||||
|
all_labels.append(label)
|
||||||
|
except ValueError:
|
||||||
|
print(f'format error in {line}')
|
||||||
return all_sentences, all_labels
|
return all_sentences, all_labels
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,25 @@ import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, return_constrained_dim=False):
|
def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, return_constrained_dim=False):
|
||||||
|
"""
|
||||||
|
Generates vectors of prevalence values artificially drawn from an exhaustive grid of prevalence values. The
|
||||||
|
number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,
|
||||||
|
`n_prevalences=11` then the prevalence values of the grid are taken from [0, 0.1, 0.2, ..., 0.9, 1]. Only
|
||||||
|
valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
|
||||||
|
valid vector of prevalence values, `repeat` copies are returned. The vector of prevalence values can be
|
||||||
|
implicit (by setting `return_constrained_dim=False`), meaning that the last dimension (which is constrained
|
||||||
|
to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to 1).
|
||||||
|
|
||||||
|
:param dimensions: the number of classes
|
||||||
|
:param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the grid
|
||||||
|
(default is 21)
|
||||||
|
:param repeat: number of copies for each valid prevalence vector (default is 1)
|
||||||
|
:param return_constrained_dim: set to True to return all dimensions, or to False (default) for omitting the
|
||||||
|
constrained dimension
|
||||||
|
:return: an ndarray of shape `(n, dimensions)` if `return_constrained_dim=True` or of shape `(n, dimensions-1)`
|
||||||
|
if `return_constrained_dim=False`, where `n` is the number of valid combinations found in the grid multiplied
|
||||||
|
by `repeat`
|
||||||
|
"""
|
||||||
s = np.linspace(0., 1., n_prevalences, endpoint=True)
|
s = np.linspace(0., 1., n_prevalences, endpoint=True)
|
||||||
s = [s] * (dimensions - 1)
|
s = [s] * (dimensions - 1)
|
||||||
prevs = [p for p in itertools.product(*s, repeat=1) if sum(p)<=1]
|
prevs = [p for p in itertools.product(*s, repeat=1) if sum(p)<=1]
|
||||||
|
@ -18,9 +37,10 @@ def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, retur
|
||||||
|
|
||||||
def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
|
def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
|
||||||
"""
|
"""
|
||||||
Produces a uniformly separated values of prevalence. By default, produces an array 21 prevalences, with step 0.05
|
Produces uniformly separated values of prevalence. By default, produces an array of 21 prevalence values, with
|
||||||
and with the limits smoothed, i.e.:
|
step 0.05 and with the limits smoothed, i.e.:
|
||||||
[0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99]
|
[0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99]
|
||||||
|
|
||||||
:param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
|
:param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
|
||||||
:param repeat: number of times each prevalence is to be repeated (defaults to 1)
|
:param repeat: number of times each prevalence is to be repeated (defaults to 1)
|
||||||
:param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
|
:param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
|
||||||
|
@ -36,12 +56,20 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
|
||||||
return p
|
return p
|
||||||
|
|
||||||
|
|
||||||
def prevalence_from_labels(labels, classes_):
|
def prevalence_from_labels(labels, classes):
|
||||||
|
"""
|
||||||
|
Computes the prevalence values from a vector of labels.
|
||||||
|
|
||||||
|
:param labels: array-like of shape `(n_instances)` with the label for each instance
|
||||||
|
:param classes: the class labels. This is needed in order to correctly compute the prevalence vector even when
|
||||||
|
some classes have no examples.
|
||||||
|
:return: an ndarray of shape `(len(classes))` with the class prevalence values
|
||||||
|
"""
|
||||||
if labels.ndim != 1:
|
if labels.ndim != 1:
|
||||||
raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
|
raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
|
||||||
unique, counts = np.unique(labels, return_counts=True)
|
unique, counts = np.unique(labels, return_counts=True)
|
||||||
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
|
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
|
||||||
prevalences = np.asarray([by_class[class_] for class_ in classes_], dtype=np.float)
|
prevalences = np.asarray([by_class[class_] for class_ in classes], dtype=np.float)
|
||||||
prevalences /= prevalences.sum()
|
prevalences /= prevalences.sum()
|
||||||
return prevalences
|
return prevalences
|
||||||
|
|
||||||
|
|
|
@ -151,9 +151,11 @@ class GridSearchQ(BaseQuantifier):
|
||||||
def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float, Callable] = None):
|
def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float, Callable] = None):
|
||||||
""" Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
|
""" Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
|
||||||
the error metric.
|
the error metric.
|
||||||
|
|
||||||
:param training: the training set on which to optimize the hyperparameters
|
:param training: the training set on which to optimize the hyperparameters
|
||||||
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
|
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
|
||||||
a float in [0,1] indicating the proportion of labelled data to extract from the training set
|
a float in [0,1] indicating the proportion of labelled data to extract from the training set
|
||||||
|
:return: self
|
||||||
"""
|
"""
|
||||||
if val_split is None:
|
if val_split is None:
|
||||||
val_split = self.val_split
|
val_split = self.val_split
|
||||||
|
@ -213,15 +215,21 @@ class GridSearchQ(BaseQuantifier):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, instances):
|
def quantify(self, instances):
|
||||||
"""Estimate class prevalence values
|
"""Estimate class prevalence values using the best model found after calling the :meth:`fit` method.
|
||||||
|
|
||||||
:param instances: sample containing the instances
|
:param instances: sample containing the instances
|
||||||
|
:return: a ndarray of shape `(n_classes)` with class prevalence estimates as according to the best model found
|
||||||
|
by the model selection process.
|
||||||
"""
|
"""
|
||||||
assert hasattr(self, 'best_model_'), 'quantify called before fit'
|
assert hasattr(self, 'best_model_'), 'quantify called before fit'
|
||||||
return self.best_model().quantify(instances)
|
return self.best_model().quantify(instances)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def classes_(self):
|
def classes_(self):
|
||||||
|
"""
|
||||||
|
Classes on which the quantifier has been trained.
|
||||||
|
:return: a ndarray of shape `(n_classes)` with the class identifiers
|
||||||
|
"""
|
||||||
return self.best_model().classes_
|
return self.best_model().classes_
|
||||||
|
|
||||||
def set_params(self, **parameters):
|
def set_params(self, **parameters):
|
||||||
|
@ -240,6 +248,12 @@ class GridSearchQ(BaseQuantifier):
|
||||||
return self.param_grid
|
return self.param_grid
|
||||||
|
|
||||||
def best_model(self):
|
def best_model(self):
|
||||||
|
"""
|
||||||
|
Returns the best model found after calling the :meth:`fit` method, i.e., the one trained on the combination
|
||||||
|
of hyper-parameters that minimized the error function.
|
||||||
|
|
||||||
|
:return: a trained quantifier
|
||||||
|
"""
|
||||||
if hasattr(self, 'best_model_'):
|
if hasattr(self, 'best_model_'):
|
||||||
return self.best_model_
|
return self.best_model_
|
||||||
raise ValueError('best_model called before fit')
|
raise ValueError('best_model called before fit')
|
||||||
|
|
130
quapy/plot.py
130
quapy/plot.py
|
@ -82,7 +82,7 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
|
||||||
bbox_to_anchor=(1, -0.5),
|
bbox_to_anchor=(1, -0.5),
|
||||||
ncol=(len(method_names)+1)//2)
|
ncol=(len(method_names)+1)//2)
|
||||||
|
|
||||||
save_or_show(savepath)
|
_save_or_show(savepath)
|
||||||
|
|
||||||
|
|
||||||
def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
|
def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
|
||||||
|
@ -116,12 +116,14 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title
|
||||||
plt.xticks(rotation=45)
|
plt.xticks(rotation=45)
|
||||||
ax.set(ylabel='error bias', title=title)
|
ax.set(ylabel='error bias', title=title)
|
||||||
|
|
||||||
save_or_show(savepath)
|
_save_or_show(savepath)
|
||||||
|
|
||||||
|
|
||||||
def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10,
|
def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10,
|
||||||
vertical_xticks=False, legend=True, savepath=None):
|
vertical_xticks=False, legend=True, savepath=None):
|
||||||
"""
|
"""
|
||||||
|
Box-plots displaying the local bias (i.e., signed error computed as the estimated value minus the true value)
|
||||||
|
for different bins of (true) prevalence of the positive class, for each quantification method.
|
||||||
|
|
||||||
:param method_names: array-like with the method names for each experiment
|
:param method_names: array-like with the method names for each experiment
|
||||||
:param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
|
:param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
|
||||||
|
@ -132,7 +134,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
|
||||||
:param title: the title to be displayed in the plot
|
:param title: the title to be displayed in the plot
|
||||||
:param nbins: number of bins
|
:param nbins: number of bins
|
||||||
:param colormap: the matplotlib colormap to use (default cm.tab10)
|
:param colormap: the matplotlib colormap to use (default cm.tab10)
|
||||||
:param vertical_xticks:
|
:param vertical_xticks: whether or not to add secondary grid (default is False)
|
||||||
:param legend: whether or not to display the legend (default is True)
|
:param legend: whether or not to display the legend (default is True)
|
||||||
:param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
|
:param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
|
||||||
"""
|
"""
|
||||||
|
@ -202,39 +204,44 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
|
||||||
|
|
||||||
# x-axis and y-axis labels and limits
|
# x-axis and y-axis labels and limits
|
||||||
ax.set(xlabel='prevalence', ylabel='error bias', title=title)
|
ax.set(xlabel='prevalence', ylabel='error bias', title=title)
|
||||||
# ax.set_ylim(-1, 1)
|
|
||||||
ax.set_xlim(0, 1)
|
ax.set_xlim(0, 1)
|
||||||
|
|
||||||
save_or_show(savepath)
|
_save_or_show(savepath)
|
||||||
|
|
||||||
|
|
||||||
def _merge(method_names, true_prevs, estim_prevs):
|
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||||
ndims = true_prevs[0].shape[1]
|
n_bins=20, error_name='ae', show_std=False,
|
||||||
data = defaultdict(lambda: {'true': np.empty(shape=(0, ndims)), 'estim': np.empty(shape=(0, ndims))})
|
|
||||||
method_order=[]
|
|
||||||
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
|
|
||||||
data[method]['true'] = np.concatenate([data[method]['true'], true_prev])
|
|
||||||
data[method]['estim'] = np.concatenate([data[method]['estim'], estim_prev])
|
|
||||||
if method not in method_order:
|
|
||||||
method_order.append(method)
|
|
||||||
true_prevs_ = [data[m]['true'] for m in method_order]
|
|
||||||
estim_prevs_ = [data[m]['estim'] for m in method_order]
|
|
||||||
return method_order, true_prevs_, estim_prevs_
|
|
||||||
|
|
||||||
|
|
||||||
def _set_colors(ax, n_methods):
|
|
||||||
NUM_COLORS = n_methods
|
|
||||||
cm = plt.get_cmap('tab20')
|
|
||||||
ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
|
|
||||||
|
|
||||||
|
|
||||||
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, error_name='ae', show_std=False,
|
|
||||||
show_density=True,
|
show_density=True,
|
||||||
logscale=False,
|
logscale=False,
|
||||||
title=f'Quantification error as a function of distribution shift',
|
title=f'Quantification error as a function of distribution shift',
|
||||||
savepath=None,
|
|
||||||
vlines=None,
|
vlines=None,
|
||||||
method_order=None):
|
method_order=None,
|
||||||
|
savepath=None):
|
||||||
|
"""
|
||||||
|
Plots the error (along the y-axis, as measured in terms of `error_name`) as a function of the train-test shift
|
||||||
|
(along the x-axis, as measured in terms of :meth:`quapy.error.ae`). This plot is useful especially for multiclass
|
||||||
|
problems, in which "diagonal plots" may be cumbersome, and in order to gain understanding about how methods
|
||||||
|
fare in different regions of the prior probability shift spectrum (e.g., in the low-shift regime vs. in the
|
||||||
|
high-shift regime).
|
||||||
|
|
||||||
|
:param method_names: array-like with the method names for each experiment
|
||||||
|
:param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
|
||||||
|
each experiment
|
||||||
|
:param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
|
||||||
|
for each experiment
|
||||||
|
:param tr_prevs: training prevalence of each experiment
|
||||||
|
:param n_bins: number of bins in which the x-axis (i.e., the train-test shift) is to be divided (default is 20)
|
||||||
|
:param error_name: a string representing the name of an error function (as defined in `quapy.error`, default is "ae")
|
||||||
|
:param show_std: whether or not to show standard deviations as color bands (default is False)
|
||||||
|
:param show_density: whether or not to display the distribution of experiments for each bin (default is True)
|
||||||
|
:param logscale: whether or not to log-scale the y-error measure (default is False)
|
||||||
|
:param title: title of the plot (default is "Quantification error as a function of distribution shift")
|
||||||
|
:param vlines: array-like list of values (default is None). If indicated, highlights some regions of the space
|
||||||
|
using vertical dotted lines.
|
||||||
|
:param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e.,
|
||||||
|
listed in the legend and associated with matplotlib colors).
|
||||||
|
:param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
|
||||||
|
"""
|
||||||
|
|
||||||
fig, ax = plt.subplots()
|
fig, ax = plt.subplots()
|
||||||
ax.grid()
|
ax.grid()
|
||||||
|
@ -245,7 +252,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, e
|
||||||
# get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same
|
# get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same
|
||||||
# order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to
|
# order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to
|
||||||
# x_error function) and 'y' is the estim-test shift (computed as according to y_error)
|
# x_error function) and 'y' is the estim-test shift (computed as according to y_error)
|
||||||
data = __join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
||||||
|
|
||||||
_set_colors(ax, n_methods=len(method_order))
|
_set_colors(ax, n_methods=len(method_order))
|
||||||
|
|
||||||
|
@ -302,13 +309,46 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, e
|
||||||
ax.set_xlim(0, max_x)
|
ax.set_xlim(0, max_x)
|
||||||
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
|
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
|
||||||
|
|
||||||
save_or_show(savepath)
|
_save_or_show(savepath)
|
||||||
|
|
||||||
|
|
||||||
def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, binning='isomerous',
|
def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||||
|
n_bins=20, binning='isomerous',
|
||||||
x_error='ae', y_error='ae', ttest_alpha=0.005, tail_density_threshold=0.005,
|
x_error='ae', y_error='ae', ttest_alpha=0.005, tail_density_threshold=0.005,
|
||||||
method_order=None,
|
method_order=None,
|
||||||
savepath=None):
|
savepath=None):
|
||||||
|
"""
|
||||||
|
Displays (only) the top performing methods for different regions of the train-test shift in form of a broken
|
||||||
|
bar chart, in which each method has bars only for those regions in which either one of the following conditions
|
||||||
|
hold: (i) it is the best method (in average) for the bin, or (ii) it is not statistically significantly different
|
||||||
|
(in average) from the best one, as according to a two-sided t-test on independent samples at confidence `ttest_alpha`.
|
||||||
|
The binning can be made "isometric" (same size), or "isomerous" (same number of experiments -- default). A second
|
||||||
|
plot is displayed on top, that displays the distribution of experiments for each bin (when binning="isometric") or
|
||||||
|
the percentiles points of the distribution (when binning="isomerous").
|
||||||
|
|
||||||
|
:param method_names: array-like with the method names for each experiment
|
||||||
|
:param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for
|
||||||
|
each experiment
|
||||||
|
:param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components)
|
||||||
|
for each experiment
|
||||||
|
:param tr_prevs: training prevalence of each experiment
|
||||||
|
:param n_bins: number of bins in which the x-axis (i.e., the train-test shift) is to be divided (default is 20)
|
||||||
|
:param binning: type of binning, either "isomerous" (default) or "isometric"
|
||||||
|
:param x_error: a string representing the name of an error function (as defined in `quapy.error`) to be used for
|
||||||
|
measuring the amount of train-test shift (default is "ae")
|
||||||
|
:param y_error: a string representing the name of an error function (as defined in `quapy.error`) to be used for
|
||||||
|
measuring the amount of error in the prevalence estimations (default is "ae")
|
||||||
|
:param ttest_alpha: the confidence interval above which a p-value (two-sided t-test on independent samples) is
|
||||||
|
to be considered as an indicator that the two means are not statistically significantly different. Default is
|
||||||
|
0.005, meaning that a `p-value > 0.005` indicates the two methods involved are to be considered similar
|
||||||
|
:param tail_density_threshold: sets a threshold on the density of experiments (over the total number of experiments)
|
||||||
|
below which a bin in the tail (i.e., the right-most ones) will be discarded. This is in order to avoid some
|
||||||
|
bins to be shown for train-test outliers.
|
||||||
|
:param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e.,
|
||||||
|
listed in the legend and associated with matplotlib colors).
|
||||||
|
:param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
assert binning in ['isomerous', 'isometric'], 'unknown binning type; valid types are "isomerous" and "isometric"'
|
assert binning in ['isomerous', 'isometric'], 'unknown binning type; valid types are "isomerous" and "isometric"'
|
||||||
|
|
||||||
x_error = getattr(qp.error, x_error)
|
x_error = getattr(qp.error, x_error)
|
||||||
|
@ -317,7 +357,7 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs
|
||||||
# get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same
|
# get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same
|
||||||
# order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to
|
# order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to
|
||||||
# x_error function) and 'y' is the estim-test shift (computed as according to y_error)
|
# x_error function) and 'y' is the estim-test shift (computed as according to y_error)
|
||||||
data = __join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
||||||
|
|
||||||
if binning == 'isomerous':
|
if binning == 'isomerous':
|
||||||
# take bins containing the same amount of examples
|
# take bins containing the same amount of examples
|
||||||
|
@ -449,10 +489,30 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs
|
||||||
ax.get_xaxis().set_visible(False)
|
ax.get_xaxis().set_visible(False)
|
||||||
plt.subplots_adjust(wspace=0, hspace=0)
|
plt.subplots_adjust(wspace=0, hspace=0)
|
||||||
|
|
||||||
save_or_show(savepath)
|
_save_or_show(savepath)
|
||||||
|
|
||||||
|
|
||||||
def save_or_show(savepath):
|
def _merge(method_names, true_prevs, estim_prevs):
|
||||||
|
ndims = true_prevs[0].shape[1]
|
||||||
|
data = defaultdict(lambda: {'true': np.empty(shape=(0, ndims)), 'estim': np.empty(shape=(0, ndims))})
|
||||||
|
method_order=[]
|
||||||
|
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
|
||||||
|
data[method]['true'] = np.concatenate([data[method]['true'], true_prev])
|
||||||
|
data[method]['estim'] = np.concatenate([data[method]['estim'], estim_prev])
|
||||||
|
if method not in method_order:
|
||||||
|
method_order.append(method)
|
||||||
|
true_prevs_ = [data[m]['true'] for m in method_order]
|
||||||
|
estim_prevs_ = [data[m]['estim'] for m in method_order]
|
||||||
|
return method_order, true_prevs_, estim_prevs_
|
||||||
|
|
||||||
|
|
||||||
|
def _set_colors(ax, n_methods):
|
||||||
|
NUM_COLORS = n_methods
|
||||||
|
cm = plt.get_cmap('tab20')
|
||||||
|
ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
|
||||||
|
|
||||||
|
|
||||||
|
def _save_or_show(savepath):
|
||||||
# if savepath is specified, then saves the plot in that path; otherwise the plot is shown
|
# if savepath is specified, then saves the plot in that path; otherwise the plot is shown
|
||||||
if savepath is not None:
|
if savepath is not None:
|
||||||
qp.util.create_parent_dir(savepath)
|
qp.util.create_parent_dir(savepath)
|
||||||
|
@ -462,7 +522,7 @@ def save_or_show(savepath):
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
|
||||||
def __join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order):
|
def _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order):
|
||||||
data = defaultdict(lambda: {'x': np.empty(shape=(0)), 'y': np.empty(shape=(0))})
|
data = defaultdict(lambda: {'x': np.empty(shape=(0)), 'y': np.empty(shape=(0))})
|
||||||
|
|
||||||
if method_order is None:
|
if method_order is None:
|
||||||
|
|
111
quapy/util.py
111
quapy/util.py
|
@ -23,6 +23,10 @@ def map_parallel(func, args, n_jobs):
|
||||||
"""
|
"""
|
||||||
Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then
|
Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then
|
||||||
func is applied in two parallel processes to args[0:50] and to args[50:99]
|
func is applied in two parallel processes to args[0:50] and to args[50:99]
|
||||||
|
|
||||||
|
:param func: function to be parallelized
|
||||||
|
:param args: array-like of arguments to be passed to the function in different parallel calls
|
||||||
|
:param n_jobs: the number of workers
|
||||||
"""
|
"""
|
||||||
args = np.asarray(args)
|
args = np.asarray(args)
|
||||||
slices = _get_parallel_slices(len(args), n_jobs)
|
slices = _get_parallel_slices(len(args), n_jobs)
|
||||||
|
@ -35,10 +39,12 @@ def map_parallel(func, args, n_jobs):
|
||||||
def parallel(func, args, n_jobs):
|
def parallel(func, args, n_jobs):
|
||||||
"""
|
"""
|
||||||
A wrapper of multiprocessing:
|
A wrapper of multiprocessing:
|
||||||
Parallel(n_jobs=n_jobs)(
|
|
||||||
delayed(func)(args_i) for args_i in args
|
>>> Parallel(n_jobs=n_jobs)(
|
||||||
)
|
>>> delayed(func)(args_i) for args_i in args
|
||||||
that takes the quapy.environ variable as input silently
|
>>> )
|
||||||
|
|
||||||
|
that takes the `quapy.environ` variable as input silently
|
||||||
"""
|
"""
|
||||||
def func_dec(environ, *args):
|
def func_dec(environ, *args):
|
||||||
qp.environ = environ
|
qp.environ = environ
|
||||||
|
@ -52,8 +58,10 @@ def parallel(func, args, n_jobs):
|
||||||
def temp_seed(seed):
|
def temp_seed(seed):
|
||||||
"""
|
"""
|
||||||
Can be used in a "with" context to set a temporal seed without modifying the outer numpy's current state. E.g.:
|
Can be used in a "with" context to set a temporal seed without modifying the outer numpy's current state. E.g.:
|
||||||
with temp_seed(random_seed):
|
|
||||||
# do any computation depending on np.random functionality
|
>>> with temp_seed(random_seed):
|
||||||
|
>>> pass # do any computation depending on np.random functionality
|
||||||
|
|
||||||
:param seed: the seed to set within the "with" context
|
:param seed: the seed to set within the "with" context
|
||||||
"""
|
"""
|
||||||
state = np.random.get_state()
|
state = np.random.get_state()
|
||||||
|
@ -65,6 +73,12 @@ def temp_seed(seed):
|
||||||
|
|
||||||
|
|
||||||
def download_file(url, archive_filename):
|
def download_file(url, archive_filename):
|
||||||
|
"""
|
||||||
|
Downloads a file from a url
|
||||||
|
|
||||||
|
:param url: the url
|
||||||
|
:param archive_filename: destination filename
|
||||||
|
"""
|
||||||
def progress(blocknum, bs, size):
|
def progress(blocknum, bs, size):
|
||||||
total_sz_mb = '%.2f MB' % (size / 1e6)
|
total_sz_mb = '%.2f MB' % (size / 1e6)
|
||||||
current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
|
current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
|
||||||
|
@ -74,31 +88,62 @@ def download_file(url, archive_filename):
|
||||||
print("")
|
print("")
|
||||||
|
|
||||||
|
|
||||||
def download_file_if_not_exists(url, archive_path):
|
def download_file_if_not_exists(url, archive_filename):
|
||||||
if os.path.exists(archive_path):
|
"""
|
||||||
|
Downloads a file (using :meth:`download_file`) if the file does not exist.
|
||||||
|
|
||||||
|
:param url: the url
|
||||||
|
:param archive_filename: destination filename
|
||||||
|
"""
|
||||||
|
if os.path.exists(archive_filename):
|
||||||
return
|
return
|
||||||
create_if_not_exist(os.path.dirname(archive_path))
|
create_if_not_exist(os.path.dirname(archive_filename))
|
||||||
download_file(url,archive_path)
|
download_file(url, archive_filename)
|
||||||
|
|
||||||
|
|
||||||
def create_if_not_exist(path):
|
def create_if_not_exist(path):
|
||||||
|
"""
|
||||||
|
An alias to `os.makedirs(path, exist_ok=True)` that also returns the path. This is useful in cases like, e.g.:
|
||||||
|
|
||||||
|
>>> path = create_if_not_exist(os.path.join(dir, subdir, anotherdir))
|
||||||
|
|
||||||
|
:param path: path to create
|
||||||
|
:return: the path itself
|
||||||
|
"""
|
||||||
os.makedirs(path, exist_ok=True)
|
os.makedirs(path, exist_ok=True)
|
||||||
return path
|
return path
|
||||||
|
|
||||||
|
|
||||||
def get_quapy_home():
|
def get_quapy_home():
|
||||||
|
"""
|
||||||
|
Gets the home directory of QuaPy, i.e., the directory where QuaPy saves permanent data, such as downloaded datasets.
|
||||||
|
|
||||||
|
:return: a string representing the path
|
||||||
|
"""
|
||||||
home = os.path.join(str(Path.home()), 'quapy_data')
|
home = os.path.join(str(Path.home()), 'quapy_data')
|
||||||
os.makedirs(home, exist_ok=True)
|
os.makedirs(home, exist_ok=True)
|
||||||
return home
|
return home
|
||||||
|
|
||||||
|
|
||||||
def create_parent_dir(path):
|
def create_parent_dir(path):
|
||||||
|
"""
|
||||||
|
Creates the parent dir (if any) of a given path, if it does not exist. E.g., for `./path/to/file.txt`, the path `./path/to`
|
||||||
|
is created.
|
||||||
|
|
||||||
|
:param path: the path
|
||||||
|
"""
|
||||||
parentdir = Path(path).parent
|
parentdir = Path(path).parent
|
||||||
if parentdir:
|
if parentdir:
|
||||||
os.makedirs(parentdir, exist_ok=True)
|
os.makedirs(parentdir, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
def save_text_file(path, text):
|
def save_text_file(path, text):
|
||||||
|
"""
|
||||||
|
Saves a text file to disk, given its full path, and creates the parent directory if missing.
|
||||||
|
|
||||||
|
:param path: path where to save the text.
|
||||||
|
:param text: text to save.
|
||||||
|
"""
|
||||||
create_parent_dir(path)
|
create_parent_dir(path)
|
||||||
with open(text, 'wt') as fout:
|
with open(text, 'wt') as fout:
|
||||||
fout.write(text)
|
fout.write(text)
|
||||||
|
@ -108,10 +153,12 @@ def pickled_resource(pickle_path:str, generation_func:callable, *args):
|
||||||
"""
|
"""
|
||||||
Allows for fast reuse of resources that are generated only once by calling generation_func(*args). The next times
|
Allows for fast reuse of resources that are generated only once by calling generation_func(*args). The next times
|
||||||
this function is invoked, it loads the pickled resource. Example:
|
this function is invoked, it loads the pickled resource. Example:
|
||||||
def some_array(n):
|
|
||||||
return np.random.rand(n)
|
>>> def some_array(n): # a mock resource created with one parameter (`n`)
|
||||||
pickled_resource('./my_array.pkl', some_array, 10) # the resource does not exist: it is created by some_array(10)
|
...     return np.random.rand(n)
|
||||||
pickled_resource('./my_array.pkl', some_array, 10) # the resource exists: it is loaded from './my_array.pkl'
|
>>> pickled_resource('./my_array.pkl', some_array, 10) # the resource does not exist: it is created by calling some_array(10)
|
||||||
|
>>> pickled_resource('./my_array.pkl', some_array, 10) # the resource exists; it is loaded from './my_array.pkl'
|
||||||
|
|
||||||
:param pickle_path: the path where to save (first time) and load (next times) the resource
|
:param pickle_path: the path where to save (first time) and load (next times) the resource
|
||||||
:param generation_func: the function that generates the resource, in case it does not exist in pickle_path
|
:param generation_func: the function that generates the resource, in case it does not exist in pickle_path
|
||||||
:param args: any arg that generation_func uses for generating the resources
|
:param args: any arg that generation_func uses for generating the resources
|
||||||
|
@ -130,8 +177,36 @@ def pickled_resource(pickle_path:str, generation_func:callable, *args):
|
||||||
|
|
||||||
|
|
||||||
class EarlyStop:
|
class EarlyStop:
|
||||||
|
"""
|
||||||
|
A class implementing the early-stopping condition typically used for training neural networks.
|
||||||
|
|
||||||
|
:param patience: the number of (consecutive) times that a monitored evaluation metric (typically obtained in a
|
||||||
|
held-out validation split) can be found to be worse than the best one obtained so far, before flagging the
|
||||||
|
stopping condition. An instance of this class is `callable`, and is to be used as follows:
|
||||||
|
|
||||||
|
>>> earlystop = EarlyStop(patience=2, lower_is_better=True)
|
||||||
|
>>> earlystop(0.9, epoch=0)
|
||||||
|
>>> earlystop(0.7, epoch=1)
|
||||||
|
>>> earlystop.IMPROVED # is True
|
||||||
|
>>> earlystop(1.0, epoch=2)
|
||||||
|
>>> earlystop.STOP # is False (patience=1)
|
||||||
|
>>> earlystop(1.0, epoch=3)
|
||||||
|
>>> earlystop.STOP # is True (patience=0)
|
||||||
|
>>> earlystop.best_epoch # is 1
|
||||||
|
>>> earlystop.best_score # is 0.7
|
||||||
|
|
||||||
|
|
||||||
|
:param lower_is_better: if True (default) the metric is to be minimized.
|
||||||
|
|
||||||
|
:ivar best_score: keeps track of the best value seen so far
|
||||||
|
:ivar best_epoch: keeps track of the epoch in which the best score was set
|
||||||
|
:ivar STOP: flag (boolean) indicating the stopping condition
|
||||||
|
:ivar IMPROVED: flag (boolean) indicating whether there was an improvement in the last call
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, patience, lower_is_better=True):
|
def __init__(self, patience, lower_is_better=True):
|
||||||
|
|
||||||
self.PATIENCE_LIMIT = patience
|
self.PATIENCE_LIMIT = patience
|
||||||
self.better = lambda a,b: a<b if lower_is_better else a>b
|
self.better = lambda a,b: a<b if lower_is_better else a>b
|
||||||
self.patience = patience
|
self.patience = patience
|
||||||
|
@ -141,6 +216,14 @@ class EarlyStop:
|
||||||
self.IMPROVED = False
|
self.IMPROVED = False
|
||||||
|
|
||||||
def __call__(self, watch_score, epoch):
|
def __call__(self, watch_score, epoch):
|
||||||
|
"""
|
||||||
|
Commits the new score found in epoch `epoch`. If the score improves over the best score found so far, then
|
||||||
|
the patience counter gets reset. Otherwise, the patience counter is decreased and, in case it reaches 0,
|
||||||
|
the flag STOP becomes True.
|
||||||
|
|
||||||
|
:param watch_score: the new score
|
||||||
|
:param epoch: the current epoch
|
||||||
|
"""
|
||||||
self.IMPROVED = (self.best_score is None or self.better(watch_score, self.best_score))
|
self.IMPROVED = (self.best_score is None or self.better(watch_score, self.best_score))
|
||||||
if self.IMPROVED:
|
if self.IMPROVED:
|
||||||
self.best_score = watch_score
|
self.best_score = watch_score
|
||||||
|
|
Loading…
Reference in New Issue