format fix
This commit is contained in:
parent
238a30520c
commit
611d080ca6
|
@ -1,12 +1,5 @@
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from tqdm import tqdm
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
import quapy as qp
|
|
||||||
from quapy.data import LabelledCollection
|
|
||||||
from quapy.method.aggregative import *
|
from quapy.method.aggregative import *
|
||||||
import quapy.functional as F
|
import quapy.functional as F
|
||||||
from data import *
|
from data import *
|
||||||
|
@ -50,9 +43,11 @@ def gen_samples():
|
||||||
return gen_load_samples_T1(T1A_devvectors_path, nF, ground_truth_path=T1A_devprevalence_path, return_id=False)
|
return gen_load_samples_T1(T1A_devvectors_path, nF, ground_truth_path=T1A_devprevalence_path, return_id=False)
|
||||||
|
|
||||||
|
|
||||||
for quantifier in [CC]: #, ACC, PCC, PACC, EMQ, HDy]:
|
for quantifier in [EMQ]: # [CC, ACC, PCC, PACC, EMQ, HDy]:
|
||||||
#classifier = CalibratedClassifierCV(LogisticRegression(), n_jobs=-1)
|
if quantifier == EMQ:
|
||||||
classifier = LogisticRegression()
|
classifier = CalibratedClassifierCV(LogisticRegression(), n_jobs=-1)
|
||||||
|
else:
|
||||||
|
classifier = LogisticRegression()
|
||||||
model = quantifier(classifier)
|
model = quantifier(classifier)
|
||||||
print(f'{model.__class__.__name__}: Model selection')
|
print(f'{model.__class__.__name__}: Model selection')
|
||||||
model = qp.model_selection.GridSearchQ(
|
model = qp.model_selection.GridSearchQ(
|
||||||
|
|
|
@ -2,7 +2,6 @@ import argparse
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from data import ResultSubmission, evaluate_submission
|
from data import ResultSubmission, evaluate_submission
|
||||||
import constants
|
import constants
|
||||||
import os
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
LeQua2022 Official evaluation script
|
LeQua2022 Official evaluation script
|
||||||
|
@ -20,9 +19,7 @@ def main(args):
|
||||||
print(f'MRAE: {mrae:.4f}')
|
print(f'MRAE: {mrae:.4f}')
|
||||||
|
|
||||||
if args.output is not None:
|
if args.output is not None:
|
||||||
outdir = os.path.dirname(args.output)
|
qp.util.create_parent_dir(args.output)
|
||||||
if outdir:
|
|
||||||
os.makedirs(outdir, exist_ok=True)
|
|
||||||
with open(args.output, 'wt') as foo:
|
with open(args.output, 'wt') as foo:
|
||||||
foo.write(f'MAE: {mae:.4f}\n')
|
foo.write(f'MAE: {mae:.4f}\n')
|
||||||
foo.write(f'MRAE: {mrae:.4f}\n')
|
foo.write(f'MRAE: {mrae:.4f}\n')
|
|
@ -1,11 +1,11 @@
|
||||||
import argparse
|
import argparse
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from data import ResultSubmission, evaluate_submission
|
from data import ResultSubmission
|
||||||
import constants
|
import constants
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from data import gen_load_samples_T1, load_category_map
|
from data import gen_load_samples_T1
|
||||||
from glob import glob
|
from glob import glob
|
||||||
import constants
|
import constants
|
||||||
|
|
||||||
|
@ -22,21 +22,16 @@ def main(args):
|
||||||
f'dev samples ({constants.DEV_SAMPLES}) nor with the expected number of '
|
f'dev samples ({constants.DEV_SAMPLES}) nor with the expected number of '
|
||||||
f'test samples ({constants.TEST_SAMPLES}).')
|
f'test samples ({constants.TEST_SAMPLES}).')
|
||||||
|
|
||||||
# _, categories = load_category_map(args.catmap)
|
|
||||||
|
|
||||||
# load pickled model
|
# load pickled model
|
||||||
model = pickle.load(open(args.model, 'rb'))
|
model = pickle.load(open(args.model, 'rb'))
|
||||||
|
|
||||||
# predictions
|
# predictions
|
||||||
predictions = ResultSubmission()
|
predictions = ResultSubmission()
|
||||||
for sampleid, sample in tqdm(gen_load_samples_T1(args.samples, args.nf),
|
for sampleid, sample in tqdm(gen_load_samples_T1(args.samples, args.nf), desc='predicting', total=nsamples):
|
||||||
desc='predicting', total=nsamples):
|
|
||||||
predictions.add(sampleid, model.quantify(sample))
|
predictions.add(sampleid, model.quantify(sample))
|
||||||
|
|
||||||
# saving
|
# saving
|
||||||
basedir = os.path.basename(args.output)
|
qp.util.create_parent_dir(args.output)
|
||||||
if basedir:
|
|
||||||
os.makedirs(basedir, exist_ok=True)
|
|
||||||
predictions.dump(args.output)
|
predictions.dump(args.output)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,46 @@ import inspect
|
||||||
|
|
||||||
|
|
||||||
class GridSearchQ(BaseQuantifier):
|
class GridSearchQ(BaseQuantifier):
|
||||||
|
"""Grid Search optimization targeting a quantification-oriented metric.
|
||||||
|
|
||||||
|
Optimizes the hyperparameters of a quantification method, based on an evaluation method and on an evaluation
|
||||||
|
protocol for quantification.
|
||||||
|
|
||||||
|
:param model: the quantifier to optimize
|
||||||
|
:type model: BaseQuantifier
|
||||||
|
:param param_grid: a dictionary with keys the parameter names and values the list of values to explore
|
||||||
|
:param sample_size: the size of the samples to extract from the validation set (ignored if protocl='gen')
|
||||||
|
:param protocol: either 'app' for the artificial prevalence protocol, 'npp' for the natural prevalence
|
||||||
|
protocol, or 'gen' for using a custom sampling generator function
|
||||||
|
:param n_prevpoints: if specified, indicates the number of equally distant points to extract from the interval
|
||||||
|
[0,1] in order to define the prevalences of the samples; e.g., if n_prevpoints=5, then the prevalences for
|
||||||
|
each class will be explored in [0.00, 0.25, 0.50, 0.75, 1.00]. If not specified, then eval_budget is requested.
|
||||||
|
Ignored if protocol!='app'.
|
||||||
|
:param n_repetitions: the number of repetitions for each combination of prevalences. This parameter is ignored
|
||||||
|
for the protocol='app' if eval_budget is set and is lower than the number of combinations that would be
|
||||||
|
generated using the value assigned to n_prevpoints (for the current number of classes and n_repetitions).
|
||||||
|
Ignored for protocol='npp' and protocol='gen' (use eval_budget for setting a maximum number of samples in
|
||||||
|
those cases).
|
||||||
|
:param eval_budget: if specified, sets a ceil on the number of evaluations to perform for each hyper-parameter
|
||||||
|
combination. For example, if protocol='app', there are 3 classes, n_repetitions=1 and eval_budget=20, then
|
||||||
|
n_prevpoints will be set to 5, since this will generate 15 different prevalences, i.e., [0, 0, 1],
|
||||||
|
[0, 0.25, 0.75], [0, 0.5, 0.5] ... [1, 0, 0], and since setting it to 6 would generate more than
|
||||||
|
20. When protocol='gen', indicates the maximum number of samples to generate, but less samples will be
|
||||||
|
generated if the generator yields less samples.
|
||||||
|
:param error: an error function (callable) or a string indicating the name of an error function (valid ones
|
||||||
|
are those in qp.error.QUANTIFICATION_ERROR
|
||||||
|
:param refit: whether or not to refit the model on the whole labelled collection (training+validation) with
|
||||||
|
the best chosen hyperparameter combination. Ignored if protocol='gen'
|
||||||
|
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
|
||||||
|
a float in [0,1] indicating the proportion of labelled data to extract from the training set, or a callable
|
||||||
|
returning a generator function each time it is invoked (only for protocol='gen').
|
||||||
|
:param n_jobs: number of parallel jobs
|
||||||
|
:param random_seed: set the seed of the random generator to replicate experiments. Ignored if protocol='gen'.
|
||||||
|
:param timeout: establishes a timer (in seconds) for each of the hyperparameters configurations being tested.
|
||||||
|
Whenever a run takes longer than this timer, that configuration will be ignored. If all configurations end up
|
||||||
|
being ignored, a TimeoutError exception is raised. If -1 (default) then no time bound is set.
|
||||||
|
:param verbose: set to True to get information through the stdout
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
model: BaseQuantifier,
|
model: BaseQuantifier,
|
||||||
|
@ -27,43 +67,7 @@ class GridSearchQ(BaseQuantifier):
|
||||||
random_seed=42,
|
random_seed=42,
|
||||||
timeout=-1,
|
timeout=-1,
|
||||||
verbose=False):
|
verbose=False):
|
||||||
"""
|
|
||||||
Optimizes the hyperparameters of a quantification method, based on an evaluation method and on an evaluation
|
|
||||||
protocol for quantification.
|
|
||||||
:param model: the quantifier to optimize
|
|
||||||
:param param_grid: a dictionary with keys the parameter names and values the list of values to explore for
|
|
||||||
:param sample_size: the size of the samples to extract from the validation set (ignored if protocl='gen')
|
|
||||||
:param protocol: either 'app' for the artificial prevalence protocol, 'npp' for the natural prevalence
|
|
||||||
protocol, or 'gen' for using a custom sampling generator function
|
|
||||||
:param n_prevpoints: if specified, indicates the number of equally distant points to extract from the interval
|
|
||||||
[0,1] in order to define the prevalences of the samples; e.g., if n_prevpoints=5, then the prevalences for
|
|
||||||
each class will be explored in [0.00, 0.25, 0.50, 0.75, 1.00]. If not specified, then eval_budget is requested.
|
|
||||||
Ignored if protocol!='app'.
|
|
||||||
:param n_repetitions: the number of repetitions for each combination of prevalences. This parameter is ignored
|
|
||||||
for the protocol='app' if eval_budget is set and is lower than the number of combinations that would be
|
|
||||||
generated using the value assigned to n_prevpoints (for the current number of classes and n_repetitions).
|
|
||||||
Ignored for protocol='npp' and protocol='gen' (use eval_budget for setting a maximum number of samples in
|
|
||||||
those cases).
|
|
||||||
:param eval_budget: if specified, sets a ceil on the number of evaluations to perform for each hyper-parameter
|
|
||||||
combination. For example, if protocol='app', there are 3 classes, n_repetitions=1 and eval_budget=20, then
|
|
||||||
n_prevpoints will be set to 5, since this will generate 15 different prevalences, i.e., [0, 0, 1],
|
|
||||||
[0, 0.25, 0.75], [0, 0.5, 0.5] ... [1, 0, 0], and since setting it to 6 would generate more than
|
|
||||||
20. When protocol='gen', indicates the maximum number of samples to generate, but less samples will be
|
|
||||||
generated if the generator yields less samples.
|
|
||||||
:param error: an error function (callable) or a string indicating the name of an error function (valid ones
|
|
||||||
are those in qp.error.QUANTIFICATION_ERROR
|
|
||||||
:param refit: whether or not to refit the model on the whole labelled collection (training+validation) with
|
|
||||||
the best chosen hyperparameter combination. Ignored if protocol='gen'
|
|
||||||
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
|
|
||||||
a float in [0,1] indicating the proportion of labelled data to extract from the training set, or a callable
|
|
||||||
returning a generator function each time it is invoked (only for protocol='gen').
|
|
||||||
:param n_jobs: number of parallel jobs
|
|
||||||
:param random_seed: set the seed of the random generator to replicate experiments. Ignored if protocol='gen'.
|
|
||||||
:param timeout: establishes a timer (in seconds) for each of the hyperparameters configurations being tested.
|
|
||||||
Whenever a run takes longer than this timer, that configuration will be ignored. If all configurations end up
|
|
||||||
being ignored, a TimeoutError exception is raised. If -1 (default) then no time bound is set.
|
|
||||||
:param verbose: set to True to get information through the stdout
|
|
||||||
"""
|
|
||||||
self.model = model
|
self.model = model
|
||||||
self.param_grid = param_grid
|
self.param_grid = param_grid
|
||||||
self.sample_size = sample_size
|
self.sample_size = sample_size
|
||||||
|
@ -90,7 +94,7 @@ class GridSearchQ(BaseQuantifier):
|
||||||
if self.n_prevpoints != 1:
|
if self.n_prevpoints != 1:
|
||||||
print('[warning] n_prevpoints has been set and will be ignored for the selected protocol')
|
print('[warning] n_prevpoints has been set and will be ignored for the selected protocol')
|
||||||
|
|
||||||
def sout(self, msg):
|
def _sout(self, msg):
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f'[{self.__class__.__name__}]: {msg}')
|
print(f'[{self.__class__.__name__}]: {msg}')
|
||||||
|
|
||||||
|
@ -145,10 +149,11 @@ class GridSearchQ(BaseQuantifier):
|
||||||
raise ValueError('unknown protocol')
|
raise ValueError('unknown protocol')
|
||||||
|
|
||||||
def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float, Callable] = None):
|
def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float, Callable] = None):
|
||||||
"""
|
""" Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
|
||||||
|
the error metric.
|
||||||
:param training: the training set on which to optimize the hyperparameters
|
:param training: the training set on which to optimize the hyperparameters
|
||||||
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
|
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
|
||||||
a float in [0,1] indicating the proportion of labelled data to extract from the training set
|
a float in [0,1] indicating the proportion of labelled data to extract from the training set
|
||||||
"""
|
"""
|
||||||
if val_split is None:
|
if val_split is None:
|
||||||
val_split = self.val_split
|
val_split = self.val_split
|
||||||
|
@ -164,12 +169,12 @@ class GridSearchQ(BaseQuantifier):
|
||||||
|
|
||||||
if self.timeout > 0:
|
if self.timeout > 0:
|
||||||
def handler(signum, frame):
|
def handler(signum, frame):
|
||||||
self.sout('timeout reached')
|
self._sout('timeout reached')
|
||||||
raise TimeoutError()
|
raise TimeoutError()
|
||||||
|
|
||||||
signal.signal(signal.SIGALRM, handler)
|
signal.signal(signal.SIGALRM, handler)
|
||||||
|
|
||||||
self.sout(f'starting optimization with n_jobs={n_jobs}')
|
self._sout(f'starting optimization with n_jobs={n_jobs}')
|
||||||
self.param_scores_ = {}
|
self.param_scores_ = {}
|
||||||
self.best_score_ = None
|
self.best_score_ = None
|
||||||
some_timeouts = False
|
some_timeouts = False
|
||||||
|
@ -185,7 +190,7 @@ class GridSearchQ(BaseQuantifier):
|
||||||
model.fit(training)
|
model.fit(training)
|
||||||
true_prevalences, estim_prevalences = self.__generate_predictions(model, val_split)
|
true_prevalences, estim_prevalences = self.__generate_predictions(model, val_split)
|
||||||
score = self.error(true_prevalences, estim_prevalences)
|
score = self.error(true_prevalences, estim_prevalences)
|
||||||
self.sout(f'checking hyperparams={params} got {self.error.__name__} score {score:.5f}')
|
self._sout(f'checking hyperparams={params} got {self.error.__name__} score {score:.5f}')
|
||||||
if self.best_score_ is None or score < self.best_score_:
|
if self.best_score_ is None or score < self.best_score_:
|
||||||
self.best_score_ = score
|
self.best_score_ = score
|
||||||
self.best_params_ = params
|
self.best_params_ = params
|
||||||
|
@ -201,15 +206,19 @@ class GridSearchQ(BaseQuantifier):
|
||||||
if self.best_score_ is None and some_timeouts:
|
if self.best_score_ is None and some_timeouts:
|
||||||
raise TimeoutError('all jobs took more than the timeout time to end')
|
raise TimeoutError('all jobs took more than the timeout time to end')
|
||||||
|
|
||||||
self.sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
|
self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
|
||||||
|
|
||||||
if self.refit:
|
if self.refit:
|
||||||
self.sout(f'refitting on the whole development set')
|
self._sout(f'refitting on the whole development set')
|
||||||
self.best_model_.fit(training + val_split)
|
self.best_model_.fit(training + val_split)
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, instances):
|
def quantify(self, instances):
|
||||||
|
"""Estimate class prevalence values
|
||||||
|
|
||||||
|
:param instances: sample contanining the instances
|
||||||
|
"""
|
||||||
assert hasattr(self, 'best_model_'), 'quantify called before fit'
|
assert hasattr(self, 'best_model_'), 'quantify called before fit'
|
||||||
return self.best_model().quantify(instances)
|
return self.best_model().quantify(instances)
|
||||||
|
|
||||||
|
@ -218,9 +227,18 @@ class GridSearchQ(BaseQuantifier):
|
||||||
return self.best_model().classes_
|
return self.best_model().classes_
|
||||||
|
|
||||||
def set_params(self, **parameters):
|
def set_params(self, **parameters):
|
||||||
|
"""Sets the hyper-parameters to explore.
|
||||||
|
|
||||||
|
:param parameters: a dictionary with keys the parameter names and values the list of values to explore
|
||||||
|
"""
|
||||||
self.param_grid = parameters
|
self.param_grid = parameters
|
||||||
|
|
||||||
def get_params(self, deep=True):
|
def get_params(self, deep=True):
|
||||||
|
"""Returns the dictionary of hyper-parameters to explore (`param_grid`)
|
||||||
|
|
||||||
|
:param deep: Unused
|
||||||
|
:return: the dictionary `param_grid`
|
||||||
|
"""
|
||||||
return self.param_grid
|
return self.param_grid
|
||||||
|
|
||||||
def best_model(self):
|
def best_model(self):
|
||||||
|
|
|
@ -11,13 +11,12 @@ import numpy as np
|
||||||
from joblib import Parallel, delayed
|
from joblib import Parallel, delayed
|
||||||
|
|
||||||
|
|
||||||
def get_parallel_slices(n_tasks, n_jobs=-1):
|
def _get_parallel_slices(n_tasks, n_jobs=-1):
|
||||||
if n_jobs == -1:
|
if n_jobs == -1:
|
||||||
n_jobs = multiprocessing.cpu_count()
|
n_jobs = multiprocessing.cpu_count()
|
||||||
batch = int(n_tasks / n_jobs)
|
batch = int(n_tasks / n_jobs)
|
||||||
remainder = n_tasks % n_jobs
|
remainder = n_tasks % n_jobs
|
||||||
return [slice(job * batch, (job + 1) * batch + (remainder if job == n_jobs - 1 else 0)) for job in
|
return [slice(job * batch, (job + 1) * batch + (remainder if job == n_jobs - 1 else 0)) for job in range(n_jobs)]
|
||||||
range(n_jobs)]
|
|
||||||
|
|
||||||
|
|
||||||
def map_parallel(func, args, n_jobs):
|
def map_parallel(func, args, n_jobs):
|
||||||
|
@ -26,7 +25,7 @@ def map_parallel(func, args, n_jobs):
|
||||||
func is applied in two parallel processes to args[0:50] and to args[50:99]
|
func is applied in two parallel processes to args[0:50] and to args[50:99]
|
||||||
"""
|
"""
|
||||||
args = np.asarray(args)
|
args = np.asarray(args)
|
||||||
slices = get_parallel_slices(len(args), n_jobs)
|
slices = _get_parallel_slices(len(args), n_jobs)
|
||||||
results = Parallel(n_jobs=n_jobs)(
|
results = Parallel(n_jobs=n_jobs)(
|
||||||
delayed(func)(args[slice_i]) for slice_i in slices
|
delayed(func)(args[slice_i]) for slice_i in slices
|
||||||
)
|
)
|
||||||
|
@ -37,7 +36,7 @@ def parallel(func, args, n_jobs):
|
||||||
"""
|
"""
|
||||||
A wrapper of multiprocessing:
|
A wrapper of multiprocessing:
|
||||||
Parallel(n_jobs=n_jobs)(
|
Parallel(n_jobs=n_jobs)(
|
||||||
delayed(func)(args_i) for args_i in args
|
delayed(func)(args_i) for args_i in args
|
||||||
)
|
)
|
||||||
that takes the quapy.environ variable as input silently
|
that takes the quapy.environ variable as input silently
|
||||||
"""
|
"""
|
||||||
|
@ -49,9 +48,14 @@ def parallel(func, args, n_jobs):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
def temp_seed(seed):
|
def temp_seed(seed):
|
||||||
|
"""
|
||||||
|
Can be used in a "with" context to set a temporal seed without modifying the outer numpy's current state. E.g.:
|
||||||
|
with temp_seed(random_seed):
|
||||||
|
# do any computation depending on np.random functionality
|
||||||
|
:param seed: the seed to set within the "with" context
|
||||||
|
"""
|
||||||
state = np.random.get_state()
|
state = np.random.get_state()
|
||||||
np.random.seed(seed)
|
np.random.seed(seed)
|
||||||
try:
|
try:
|
||||||
|
@ -88,10 +92,30 @@ def get_quapy_home():
|
||||||
|
|
||||||
|
|
||||||
def create_parent_dir(path):
|
def create_parent_dir(path):
|
||||||
os.makedirs(Path(path).parent, exist_ok=True)
|
parentdir = Path(path).parent
|
||||||
|
if parentdir:
|
||||||
|
os.makedirs(parentdir, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
def save_text_file(path, text):
|
||||||
|
create_parent_dir(path)
|
||||||
|
with open(text, 'wt') as fout:
|
||||||
|
fout.write(text)
|
||||||
|
|
||||||
|
|
||||||
def pickled_resource(pickle_path:str, generation_func:callable, *args):
|
def pickled_resource(pickle_path:str, generation_func:callable, *args):
|
||||||
|
"""
|
||||||
|
Allows for fast reuse of resources that are generated only once by calling generation_func(*args). The next times
|
||||||
|
this function is invoked, it loads the pickled resource. Example:
|
||||||
|
def some_array(n):
|
||||||
|
return np.random.rand(n)
|
||||||
|
pickled_resource('./my_array.pkl', some_array, 10) # the resource does not exist: it is created by some_array(10)
|
||||||
|
pickled_resource('./my_array.pkl', some_array, 10) # the resource exists: it is loaded from './my_array.pkl'
|
||||||
|
:param pickle_path: the path where to save (first time) and load (next times) the resource
|
||||||
|
:param generation_func: the function that generates the resource, in case it does not exist in pickle_path
|
||||||
|
:param args: any arg that generation_func uses for generating the resources
|
||||||
|
:return: the resource
|
||||||
|
"""
|
||||||
if pickle_path is None:
|
if pickle_path is None:
|
||||||
return generation_func(*args)
|
return generation_func(*args)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue