
fixing hyperparameters with prefixes, and replacing learner with classifier in aggregative quantifiers

Alejandro Moreo Fernandez 2023-01-27 18:13:23 +01:00
parent adf799c8ec
commit f9a199d859
10 changed files with 352 additions and 318 deletions

View File

@@ -6,6 +6,12 @@ merge with master, because I had to fix some problems with QuaNet due to an issu
 added cross_val_predict in qp.model_selection (i.e., a cross_val_predict for quantification) --would be nice to have
 it parallelized
+check the OneVsAll module(s)
+check the set_params of neural.py, because the separation of estimator__<param> is not implemented; see also
+__check_params_colision
+HDy can be customized so that the number of bins is specified, instead of explored within the fit method

 Packaging:
 ==========================================

View File

@@ -17,7 +17,7 @@ training, val_generator, test_generator = fetch_lequa2022(task=task)
 # define the quantifier
 learner = CalibratedClassifierCV(LogisticRegression())
-quantifier = EMQ(learner=learner)
+quantifier = EMQ(classifier=learner)
 # model selection
 param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}

View File

@@ -4,7 +4,7 @@ from sklearn.calibration import CalibratedClassifierCV
 from sklearn.linear_model import LogisticRegression
 import quapy as qp
 import quapy.functional as F
-from classification.calibration import RecalibratedClassifierBase, NBVSCalibration, \
+from classification.calibration import RecalibratedProbabilisticClassifierBase, NBVSCalibration, \
     BCTSCalibration
 from data.datasets import LEQUA2022_SAMPLE_SIZE, fetch_lequa2022
 from evaluation import evaluation_report
@@ -13,7 +13,6 @@ from model_selection import GridSearchQ
 import pandas as pd

 for task in ['T1A', 'T1B']:
-    for calib in ['NoCal', 'TS', 'VS', 'NBVS', 'NBTS']:
     # calibration = TempScaling(verbose=False, bias_positions='all')
@@ -24,31 +23,36 @@ for task in ['T1A', 'T1B']:
     # learner = BCTSCalibration(LogisticRegression(), n_jobs=-1)
     # learner = CalibratedClassifierCV(LogisticRegression())
     learner = LogisticRegression()
-    quantifier = EMQ(learner=learner, exact_train_prev=False, recalib=calib.lower() if calib != 'NoCal' else None)
+    quantifier = EMQ(classifier=learner)

     # model selection
-    param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}
+    param_grid = {
+        'classifier__C': np.logspace(-3, 3, 7),
+        'classifier__class_weight': ['balanced', None],
+        'recalib': ['platt', 'ts', 'vs', 'nbvs', 'bcts', None],
+        'exact_train_prev': [False, True]
+    }
     model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', n_jobs=-1, refit=False, verbose=True)
     quantifier = model_selection.fit(training)

     # evaluation
     report = evaluation_report(quantifier, protocol=test_generator, error_metrics=['mae', 'mrae', 'mkld'], verbose=True)

-    import os
-    os.makedirs(f'./predictions/{task}', exist_ok=True)
-    with open(f'./predictions/{task}/{calib}-EMQ.csv', 'wt') as foo:
-        estim_prev = report['estim-prev'].values
-        nclasses = len(estim_prev[0])
-        foo.write(f'id,'+','.join([str(x) for x in range(nclasses)])+'\n')
-        for id, prev in enumerate(estim_prev):
-            foo.write(f'{id},'+','.join([f'{p:.5f}' for p in prev])+'\n')
-    os.makedirs(f'./errors/{task}', exist_ok=True)
-    with open(f'./errors/{task}/{calib}-EMQ.csv', 'wt') as foo:
-        maes, mraes = report['mae'].values, report['mrae'].values
-        foo.write(f'id,AE,RAE\n')
-        for id, (ae_i, rae_i) in enumerate(zip(maes, mraes)):
-            foo.write(f'{id},{ae_i:.5f},{rae_i:.5f}\n')
+    # import os
+    # os.makedirs(f'./out', exist_ok=True)
+    # with open(f'./out/EMQ_{calib}_{task}.txt', 'wt') as foo:
+    #     estim_prev = report['estim-prev'].values
+    #     nclasses = len(estim_prev[0])
+    #     foo.write(f'id,'+','.join([str(x) for x in range(nclasses)])+'\n')
+    #     for id, prev in enumerate(estim_prev):
+    #         foo.write(f'{id},'+','.join([f'{p:.5f}' for p in prev])+'\n')
+    #
+    # #os.makedirs(f'./errors/{task}', exist_ok=True)
+    # with open(f'./out/EMQ_{calib}_{task}_errors.txt', 'wt') as foo:
+    #     maes, mraes = report['mae'].values, report['mrae'].values
+    #     foo.write(f'id,AE,RAE\n')
+    #     for id, (ae_i, rae_i) in enumerate(zip(maes, mraes)):
+    #         foo.write(f'{id},{ae_i:.5f},{rae_i:.5f}\n')

     # printing results
     pd.set_option('display.expand_frame_repr', False)

View File

@@ -37,6 +37,12 @@
 - new dependency "abstention" (to add to the project requirements and setup). Calibration methods from
   https://github.com/kundajelab/abstention added.
+- the internal classifier of aggregative methods is now called "classifier" instead of "learner"
+- when optimizing the hyperparameters of an aggregative quantifier, the classifier's specific hyperparameters
+  should be marked with a "classifier__" prefix (just like in scikit-learn), while the quantifier's specific
+  hyperparameters are named directly. For example, PCC(LogisticRegression()) quantifier has
+  hyperparameters "classifier__C" and "classifier__class_weight", instead of "C" and "class_weight" as before

 Things to fix:
 - calibration with recalibration methods has to be fixed for exact_train_prev in EMQ (conflicts with clone, deepcopy, etc.)
 - clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance():
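To make the new convention concrete, here is a minimal sketch (assuming quapy's PCC and GridSearchQ as imported below and a standard scikit-learn classifier; the protocol object is left commented out since it depends on the dataset):

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import PCC
    from quapy.model_selection import GridSearchQ

    # the wrapped classifier is now passed and accessed as "classifier"
    quantifier = PCC(classifier=LogisticRegression())

    # classifier hyperparameters take the sklearn-style "classifier__" prefix;
    # quantifier-specific hyperparameters keep their bare names (e.g., EMQ's 'recalib')
    param_grid = {
        'classifier__C': np.logspace(-3, 3, 7),
        'classifier__class_weight': ['balanced', None],
    }
    # model_selection = GridSearchQ(quantifier, param_grid, protocol=..., error='mrae')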

View File

@@ -11,27 +11,27 @@ import numpy as np
 # see https://github.com/kundajelab/abstention

-class RecalibratedClassifier:
+class RecalibratedProbabilisticClassifier:
     pass


-class RecalibratedClassifierBase(BaseEstimator, RecalibratedClassifier):
+class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabilisticClassifier):
     """
     Applies a (re)calibration method from abstention.calibration, as defined in
     `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

-    :param estimator: a scikit-learn probabilistic classifier
+    :param classifier: a scikit-learn probabilistic classifier
     :param calibrator: the calibration object (an instance of abstention.calibration.CalibratorFactory)
-    :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
+    :param val_split: indicate an integer k for performing kFCV to obtain the posterior probabilities, or a float p
      in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
      training instances (the rest is used for training). In any case, the classifier is retrained in the whole
      training set afterwards.
-    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer)
+    :param n_jobs: indicate the number of parallel workers (only when val_split is an integer); default=None
     :param verbose: whether or not to display information in the standard output
     """

-    def __init__(self, estimator, calibrator, val_split=5, n_jobs=1, verbose=False):
-        self.estimator = estimator
+    def __init__(self, classifier, calibrator, val_split=5, n_jobs=None, verbose=False):
+        self.classifier = classifier
         self.calibrator = calibrator
         self.val_split = val_split
         self.n_jobs = n_jobs
@@ -50,39 +50,39 @@ class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabilisticClassifier):
     def fit_cv(self, X, y):
         posteriors = cross_val_predict(
-            self.estimator, X, y, cv=self.val_split, n_jobs=self.n_jobs, verbose=self.verbose, method="predict_proba"
+            self.classifier, X, y, cv=self.val_split, n_jobs=self.n_jobs, verbose=self.verbose, method='predict_proba'
         )
-        self.estimator.fit(X, y)
+        self.classifier.fit(X, y)
         nclasses = len(np.unique(y))
         self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[y], posterior_supplied=True)
         return self

     def fit_tr_val(self, X, y):
         Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=self.val_split, stratify=y)
-        self.estimator.fit(Xtr, ytr)
-        posteriors = self.estimator.predict_proba(Xva)
+        self.classifier.fit(Xtr, ytr)
+        posteriors = self.classifier.predict_proba(Xva)
         nclasses = len(np.unique(yva))
         self.calibrator = self.calibrator(posteriors, np.eye(nclasses)[yva], posterior_supplied=True)
         return self

     def predict(self, X):
-        return self.estimator.predict(X)
+        return self.classifier.predict(X)

     def predict_proba(self, X):
-        posteriors = self.estimator.predict_proba(X)
+        posteriors = self.classifier.predict_proba(X)
         return self.calibration_function(posteriors)

     @property
     def classes_(self):
-        return self.estimator.classes_
+        return self.classifier.classes_


-class NBVSCalibration(RecalibratedClassifierBase):
+class NBVSCalibration(RecalibratedProbabilisticClassifierBase):
     """
     Applies the No-Bias Vector Scaling (NBVS) calibration method from abstention.calibration, as defined in
     `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

-    :param estimator: a scikit-learn probabilistic classifier
+    :param classifier: a scikit-learn probabilistic classifier
     :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
      in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
      training instances (the rest is used for training). In any case, the classifier is retrained in the whole
@@ -91,20 +91,20 @@ class NBVSCalibration(RecalibratedProbabilisticClassifierBase):
     :param verbose: whether or not to display information in the standard output
     """

-    def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
-        self.estimator = estimator
+    def __init__(self, classifier, val_split=5, n_jobs=1, verbose=False):
+        self.classifier = classifier
         self.calibrator = NoBiasVectorScaling(verbose=verbose)
         self.val_split = val_split
         self.n_jobs = n_jobs
         self.verbose = verbose


-class BCTSCalibration(RecalibratedClassifierBase):
+class BCTSCalibration(RecalibratedProbabilisticClassifierBase):
     """
     Applies the Bias-Corrected Temperature Scaling (BCTS) calibration method from abstention.calibration, as defined in
     `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

-    :param estimator: a scikit-learn probabilistic classifier
+    :param classifier: a scikit-learn probabilistic classifier
     :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
      in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
      training instances (the rest is used for training). In any case, the classifier is retrained in the whole
@@ -113,20 +113,20 @@ class BCTSCalibration(RecalibratedProbabilisticClassifierBase):
     :param verbose: whether or not to display information in the standard output
     """

-    def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
-        self.estimator = estimator
+    def __init__(self, classifier, val_split=5, n_jobs=1, verbose=False):
+        self.classifier = classifier
         self.calibrator = TempScaling(verbose=verbose, bias_positions='all')
         self.val_split = val_split
         self.n_jobs = n_jobs
         self.verbose = verbose


-class TSCalibration(RecalibratedClassifierBase):
+class TSCalibration(RecalibratedProbabilisticClassifierBase):
     """
     Applies the Temperature Scaling (TS) calibration method from abstention.calibration, as defined in
     `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

-    :param estimator: a scikit-learn probabilistic classifier
+    :param classifier: a scikit-learn probabilistic classifier
     :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
      in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
      training instances (the rest is used for training). In any case, the classifier is retrained in the whole
@@ -135,20 +135,20 @@ class TSCalibration(RecalibratedProbabilisticClassifierBase):
     :param verbose: whether or not to display information in the standard output
     """

-    def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
-        self.estimator = estimator
+    def __init__(self, classifier, val_split=5, n_jobs=1, verbose=False):
+        self.classifier = classifier
         self.calibrator = TempScaling(verbose=verbose)
         self.val_split = val_split
         self.n_jobs = n_jobs
         self.verbose = verbose


-class VSCalibration(RecalibratedClassifierBase):
+class VSCalibration(RecalibratedProbabilisticClassifierBase):
     """
     Applies the Vector Scaling (VS) calibration method from abstention.calibration, as defined in
     `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_:

-    :param estimator: a scikit-learn probabilistic classifier
+    :param classifier: a scikit-learn probabilistic classifier
     :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p
      in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the
      training instances (the rest is used for training). In any case, the classifier is retrained in the whole
@@ -157,8 +157,8 @@ class VSCalibration(RecalibratedProbabilisticClassifierBase):
     :param verbose: whether or not to display information in the standard output
     """

-    def __init__(self, estimator, val_split=5, n_jobs=1, verbose=False):
-        self.estimator = estimator
+    def __init__(self, classifier, val_split=5, n_jobs=1, verbose=False):
+        self.classifier = classifier
         self.calibrator = VectorScaling(verbose=verbose)
         self.val_split = val_split
         self.n_jobs = n_jobs
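For reference, a minimal usage sketch of the renamed wrappers (assuming the base class exposes a fit(X, y) that dispatches to fit_cv or fit_tr_val depending on val_split, and that the abstention package is installed; the data here is synthetic):

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from quapy.classification.calibration import BCTSCalibration

    # synthetic binary data, just to exercise the API
    rng = np.random.RandomState(0)
    X = rng.randn(200, 5)
    y = (X[:, 0] > 0).astype(int)

    # posteriors used to fit the calibration map come from 5-fold CV (val_split=5);
    # the wrapped classifier is then refit on the full training set
    calibrated = BCTSCalibration(LogisticRegression(), val_split=5)
    calibrated.fit(X, y)
    posteriors = calibrated.predict_proba(X)  # recalibrated posterior probabilities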

View File

@@ -10,7 +10,7 @@ from sklearn.model_selection import StratifiedKFold, cross_val_predict
 from tqdm import tqdm
 import quapy as qp
 import quapy.functional as F
-from classification.calibration import RecalibratedClassifier, NBVSCalibration, BCTSCalibration, TSCalibration, \
+from classification.calibration import RecalibratedProbabilisticClassifier, NBVSCalibration, BCTSCalibration, TSCalibration, \
     VSCalibration
 from quapy.classification.svmperf import SVMperf
 from quapy.data import LabelledCollection
@@ -23,41 +23,41 @@ from quapy.method.base import BaseQuantifier, BinaryQuantifier
 class AggregativeQuantifier(BaseQuantifier):
     """
     Abstract class for quantification methods that base their estimations on the aggregation of classification
-    results. Aggregative Quantifiers thus implement a :meth:`classify` method and maintain a :attr:`learner` attribute.
-    Subclasses of this abstract class must implement the method :meth:`aggregate` which computes the aggregation
-    of label predictions. The method :meth:`quantify` comes with a default implementation based on
+    results. Aggregative Quantifiers thus implement a :meth:`classify` method and maintain a :attr:`classifier`
+    attribute. Subclasses of this abstract class must implement the method :meth:`aggregate` which computes the
+    aggregation of label predictions. The method :meth:`quantify` comes with a default implementation based on
     :meth:`classify` and :meth:`aggregate`.
     """

     @abstractmethod
-    def fit(self, data: LabelledCollection, fit_learner=True):
+    def fit(self, data: LabelledCollection, fit_classifier=True):
         """
         Trains the aggregative quantifier

         :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
-        :param fit_learner: whether or not to train the learner (default is True). Set to False if the
+        :param fit_classifier: whether or not to train the learner (default is True). Set to False if the
          learner has been trained outside the quantifier.
         :return: self
         """
         ...

     @property
-    def learner(self):
+    def classifier(self):
         """
         Gives access to the classifier

         :return: the classifier (typically an sklearn's Estimator)
         """
-        return self.learner_
+        return self.classifier_

-    @learner.setter
-    def learner(self, classifier):
+    @classifier.setter
+    def classifier(self, classifier):
         """
         Setter for the classifier

         :param classifier: the classifier
         """
-        self.learner_ = classifier
+        self.classifier_ = classifier

     def classify(self, instances):
         """
@@ -68,7 +68,7 @@ class AggregativeQuantifier(BaseQuantifier):
         :param instances: array-like
         :return: np.ndarray of shape `(n_instances,)` with label predictions
         """
-        return self.learner.predict(instances)
+        return self.classifier.predict(instances)

     def quantify(self, instances):
         """
@@ -91,24 +91,24 @@ class AggregativeQuantifier(BaseQuantifier):
         """
         ...

-    def get_params(self, deep=True):
-        """
-        Return the current parameters of the quantifier.
-
-        :param deep: for compatibility with sklearn
-        :return: a dictionary of param-value pairs
-        """
-
-        return self.learner.get_params()
-
-    def set_params(self, **parameters):
-        """
-        Set the parameters of the quantifier.
-
-        :param parameters: dictionary of param-value pairs
-        """
-
-        self.learner.set_params(**parameters)
+    # def get_params(self, deep=True):
+    #     """
+    #     Return the current parameters of the quantifier.
+    #
+    #     :param deep: for compatibility with sklearn
+    #     :return: a dictionary of param-value pairs
+    #     """
+    #
+    #     return self.learner.get_params()

+    # def set_params(self, **parameters):
+    #     """
+    #     Set the parameters of the quantifier.
+    #
+    #     :param parameters: dictionary of param-value pairs
+    #     """
+    #
+    #     self.learner.set_params(**parameters)

     @property
     def classes_(self):
@@ -118,7 +118,7 @@ class AggregativeQuantifier(BaseQuantifier):
         :return: array-like
         """
-        return self.learner.classes_
+        return self.classifier.classes_
 class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
@@ -130,43 +130,43 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
     """

     def classify(self, instances):
-        return self.learner.predict_proba(instances)
+        return self.classifier.predict_proba(instances)

-    def set_params(self, **parameters):
-        if isinstance(self.learner, CalibratedClassifierCV):
-            if self.learner.get_params().get('base_estimator') == 'deprecated':
-                key_prefix = 'estimator__'  # this has changed in the newer versions of sklearn
-            else:
-                key_prefix = 'base_estimator__'
-            parameters = {key_prefix + k: v for k, v in parameters.items()}
-        elif isinstance(self.learner, RecalibratedClassifier):
-            parameters = {'estimator__' + k: v for k, v in parameters.items()}
-
-        self.learner.set_params(**parameters)
-        return self
+    # def set_params(self, **parameters):
+    #     if isinstance(self.classifier, CalibratedClassifierCV):
+    #         if self.classifier.get_params().get('base_estimator') == 'deprecated':
+    #             key_prefix = 'estimator__'  # this has changed in the newer versions of sklearn
+    #         else:
+    #             key_prefix = 'base_estimator__'
+    #         parameters = {key_prefix + k: v for k, v in parameters.items()}
+    #     elif isinstance(self.classifier, RecalibratedClassifier):
+    #         parameters = {'estimator__' + k: v for k, v in parameters.items()}
+    #
+    #     self.classifier.set_params(**parameters)
+    #     return self
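With these ad-hoc overrides commented out, parameter routing falls back on the sklearn-style double-underscore convention. A minimal sketch of how a "classifier__" key would be routed (illustrative only, not QuaPy's actual implementation; the param dict is hypothetical):

    from sklearn.linear_model import LogisticRegression

    # hypothetical parameter dict mixing quantifier and classifier hyperparameters
    params = {'classifier__C': 0.1, 'val_split': 0.3}

    # keys with the 'classifier__' prefix are stripped and forwarded to the classifier
    clf_params = {k.split('__', 1)[1]: v for k, v in params.items()
                  if k.startswith('classifier__')}
    own_params = {k: v for k, v in params.items() if '__' not in k}

    clf = LogisticRegression()
    clf.set_params(**clf_params)  # C=0.1 reaches the wrapped classifier
    # own_params (here, val_split) would be set directly on the quantifier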
 # Helper
 # ------------------------------------
-def _ensure_probabilistic(learner):
-    if not hasattr(learner, 'predict_proba'):
-        print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
+def _ensure_probabilistic(classifier):
+    if not hasattr(classifier, 'predict_proba'):
+        print(f'The learner {classifier.__class__.__name__} does not seem to be probabilistic. '
               f'The learner will be calibrated.')
-        learner = CalibratedClassifierCV(learner, cv=5)
-    return learner
+        classifier = CalibratedClassifierCV(classifier, cv=5)
+    return classifier


-def _training_helper(learner,
+def _training_helper(classifier,
                      data: LabelledCollection,
-                     fit_learner: bool = True,
+                     fit_classifier: bool = True,
                      ensure_probabilistic=False,
                      val_split: Union[LabelledCollection, float] = None):
     """
     Training procedure common to all Aggregative Quantifiers.

-    :param learner: the learner to be fit
+    :param classifier: the learner to be fit
     :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
-    :param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
+    :param fit_classifier: whether or not to fit the learner (if False, then bypasses any action)
     :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
      learner is not probabilistic, then a CalibratedCV instance of it is trained)
     :param val_split: if specified as a float, indicates the proportion of training instances that will define the
@@ -175,9 +175,9 @@ def _training_helper(classifier,
     :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
      or None otherwise) to be used as a validation set for any subsequent parameter fitting
     """
-    if fit_learner:
+    if fit_classifier:
         if ensure_probabilistic:
-            learner = _ensure_probabilistic(learner)
+            classifier = _ensure_probabilistic(classifier)
         if val_split is not None:
             if isinstance(val_split, float):
                 if not (0 < val_split < 1):
@@ -193,72 +193,72 @@ def _training_helper(classifier,
         else:
             train, unused = data, None

-        if isinstance(learner, BaseQuantifier):
-            learner.fit(train)
+        if isinstance(classifier, BaseQuantifier):
+            classifier.fit(train)
         else:
-            learner.fit(*train.Xy)
+            classifier.fit(*train.Xy)
     else:
         if ensure_probabilistic:
-            if not hasattr(learner, 'predict_proba'):
-                raise AssertionError('error: the learner cannot be calibrated since fit_learner is set to False')
+            if not hasattr(classifier, 'predict_proba'):
+                raise AssertionError('error: the learner cannot be calibrated since fit_classifier is set to False')
         unused = None
         if isinstance(val_split, LabelledCollection):
             unused = val_split

-    return learner, unused
+    return classifier, unused
 def cross_generate_predictions(
         data,
-        learner,
+        classifier,
         val_split,
         probabilistic,
-        fit_learner,
+        fit_classifier,
         n_jobs
 ):

     n_jobs = qp.get_njobs(n_jobs)

     if isinstance(val_split, int):
-        assert fit_learner == True, \
-            'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
+        assert fit_classifier == True, \
+            'the parameters for the adjustment cannot be estimated with kFCV with fit_classifier=False'

         if probabilistic:
-            learner = _ensure_probabilistic(learner)
+            classifier = _ensure_probabilistic(classifier)
             predict = 'predict_proba'
         else:
             predict = 'predict'
-        y_pred = cross_val_predict(learner, *data.Xy, cv=val_split, n_jobs=n_jobs, method=predict)
+        y_pred = cross_val_predict(classifier, *data.Xy, cv=val_split, n_jobs=n_jobs, method=predict)
         class_count = data.counts()

         # fit the learner on all data
-        learner.fit(*data.Xy)
+        classifier.fit(*data.Xy)
         y = data.y
         classes = data.classes_
     else:
-        learner, val_data = _training_helper(
-            learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split
+        classifier, val_data = _training_helper(
+            classifier, data, fit_classifier, ensure_probabilistic=probabilistic, val_split=val_split
         )
-        y_pred = learner.predict_proba(val_data.instances) if probabilistic else learner.predict(val_data.instances)
+        y_pred = classifier.predict_proba(val_data.instances) if probabilistic else classifier.predict(val_data.instances)
         y = val_data.labels
         classes = val_data.classes_
         class_count = val_data.counts()

-    return learner, y, y_pred, classes, class_count
+    return classifier, y, y_pred, classes, class_count


 def cross_generate_predictions_depr(
         data,
-        learner,
+        classifier,
         val_split,
         probabilistic,
-        fit_learner,
+        fit_classifier,
         method_name=''
 ):
-    predict = learner.predict_proba if probabilistic else learner.predict
+    predict = classifier.predict_proba if probabilistic else classifier.predict
     if isinstance(val_split, int):
-        assert fit_learner == True, \
-            'the parameters for the adjustment cannot be estimated with kFCV with fit_learner=False'
+        assert fit_classifier == True, \
+            'the parameters for the adjustment cannot be estimated with kFCV with fit_classifier=False'
         # kFCV estimation of parameters
         y, y_ = [], []
         kfcv = StratifiedKFold(n_splits=val_split)
@@ -267,8 +267,8 @@ def cross_generate_predictions_depr(
             pbar.set_description(f'{method_name}\tfitting fold {k}')
             training = data.sampling_from_index(training_idx)
             validation = data.sampling_from_index(validation_idx)
-            learner, val_data = _training_helper(
-                learner, training, fit_learner, ensure_probabilistic=probabilistic, val_split=validation
+            classifier, val_data = _training_helper(
+                classifier, training, fit_classifier, ensure_probabilistic=probabilistic, val_split=validation
             )
             y_.append(predict(val_data.instances))
             y.append(val_data.labels)
@@ -278,21 +278,21 @@ def cross_generate_predictions_depr(
         class_count = data.counts()

         # fit the learner on all data
-        learner, _ = _training_helper(
-            learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=None
+        classifier, _ = _training_helper(
+            classifier, data, fit_classifier, ensure_probabilistic=probabilistic, val_split=None
         )
         classes = data.classes_
     else:
-        learner, val_data = _training_helper(
-            learner, data, fit_learner, ensure_probabilistic=probabilistic, val_split=val_split
+        classifier, val_data = _training_helper(
+            classifier, data, fit_classifier, ensure_probabilistic=probabilistic, val_split=val_split
         )
         y_ = predict(val_data.instances)
         y = val_data.labels
         classes = val_data.classes_
         class_count = val_data.counts()

-    return learner, y, y_, classes, class_count
+    return classifier, y, y_, classes, class_count


 # Methods
 # ------------------------------------
@@ -301,22 +301,22 @@ class CC(AggregativeQuantifier):
     The most basic Quantification method. One that simply classifies all instances and counts how many have been
     attributed to each of the classes in order to compute class prevalence estimates.

-    :param learner: a sklearn's Estimator that generates a classifier
+    :param classifier: a sklearn's Estimator that generates a classifier
     """

-    def __init__(self, learner: BaseEstimator):
-        self.learner = learner
+    def __init__(self, classifier: BaseEstimator):
+        self.classifier = classifier

-    def fit(self, data: LabelledCollection, fit_learner=True):
+    def fit(self, data: LabelledCollection, fit_classifier=True):
         """
-        Trains the Classify & Count method unless `fit_learner` is False, in which case, the classifier is assumed to
+        Trains the Classify & Count method unless `fit_classifier` is False, in which case, the classifier is assumed to
          be already fit and there is nothing else to do.

         :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data
-        :param fit_learner: if False, the classifier is assumed to be fit
+        :param fit_classifier: if False, the classifier is assumed to be fit
         :return: self
         """
-        self.learner, _ = _training_helper(self.learner, data, fit_learner)
+        self.classifier, _ = _training_helper(self.classifier, data, fit_classifier)
         return self

     def aggregate(self, classif_predictions: np.ndarray):
@@ -335,7 +335,7 @@ class ACC(AggregativeQuantifier):
     the "adjusted" variant of :class:`CC`, that corrects the predictions of CC
     according to the `misclassification rates`.

-    :param learner: a sklearn's Estimator that generates a classifier
+    :param classifier: a sklearn's Estimator that generates a classifier
     :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
      misclassification rates are to be estimated.
      This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@@ -344,17 +344,17 @@ class ACC(AggregativeQuantifier):
     :class:`quapy.data.base.LabelledCollection` (the split itself).
     """

-    def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None):
-        self.learner = learner
+    def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None):
+        self.classifier = classifier
         self.val_split = val_split
         self.n_jobs = qp.get_njobs(n_jobs)

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
+    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None):
         """
         Trains a ACC quantifier.

         :param data: the training set
-        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
+        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
         :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
          validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
          indicating the validation set itself, or an int indicating the number `k` of folds to be used in `k`-fold
@@ -365,11 +365,11 @@ class ACC(AggregativeQuantifier):
         if val_split is None:
             val_split = self.val_split

-        self.learner, y, y_, classes, class_count = cross_generate_predictions(
-            data, self.learner, val_split, probabilistic=False, fit_learner=fit_learner, n_jobs=self.n_jobs
+        self.classifier, y, y_, classes, class_count = cross_generate_predictions(
+            data, self.classifier, val_split, probabilistic=False, fit_classifier=fit_classifier, n_jobs=self.n_jobs
         )

-        self.cc = CC(self.learner)
+        self.cc = CC(self.classifier)
         self.Pte_cond_estim_ = self.getPteCondEstim(data.classes_, y, y_)
         return self
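For intuition, the ACC adjustment amounts to solving a small linear system: if M[i, j] = P(classifier predicts i | true class j), estimated on the held-out validation data, and p_cc is the classify-and-count estimate, then the corrected prevalence solves M p = p_cc. A minimal sketch with made-up numbers (not QuaPy's verbatim code):

    import numpy as np

    # hypothetical misclassification-rate matrix M[i, j] = P(predict i | true j)
    M = np.array([[0.9, 0.2],
                  [0.1, 0.8]])
    p_cc = np.array([0.62, 0.38])  # hypothetical classify-and-count estimate

    # solve M @ p = p_cc, then project back onto the probability simplex
    p_acc, *_ = np.linalg.lstsq(M, p_cc, rcond=None)
    p_acc = np.clip(p_acc, 0, None)
    p_acc /= p_acc.sum()
    print(p_acc)  # adjusted prevalence estimate; here, [0.6, 0.4]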
@@ -422,14 +422,14 @@ class PCC(AggregativeProbabilisticQuantifier):
     `Probabilistic Classify & Count <https://ieeexplore.ieee.org/abstract/document/5694031>`_,
     the probabilistic variant of CC that relies on the posterior probabilities returned by a probabilistic classifier.

-    :param learner: a sklearn's Estimator that generates a classifier
+    :param classifier: a sklearn's Estimator that generates a classifier
     """

-    def __init__(self, learner: BaseEstimator):
-        self.learner = learner
+    def __init__(self, classifier: BaseEstimator):
+        self.classifier = classifier

-    def fit(self, data: LabelledCollection, fit_learner=True):
-        self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
+    def fit(self, data: LabelledCollection, fit_classifier=True):
+        self.classifier, _ = _training_helper(self.classifier, data, fit_classifier, ensure_probabilistic=True)
         return self

     def aggregate(self, classif_posteriors):
@@ -441,7 +441,7 @@ class PACC(AggregativeProbabilisticQuantifier):
     `Probabilistic Adjusted Classify & Count <https://ieeexplore.ieee.org/abstract/document/5694031>`_,
     the probabilistic variant of ACC that relies on the posterior probabilities returned by a probabilistic classifier.

-    :param learner: a sklearn's Estimator that generates a classifier
+    :param classifier: a sklearn's Estimator that generates a classifier
     :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
      misclassification rates are to be estimated.
      This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@@ -451,17 +451,17 @@ class PACC(AggregativeProbabilisticQuantifier):
     :param n_jobs: number of parallel workers
     """

-    def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None):
-        self.learner = learner
+    def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None):
+        self.classifier = classifier
         self.val_split = val_split
         self.n_jobs = qp.get_njobs(n_jobs)

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
+    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None):
         """
         Trains a PACC quantifier.

         :param data: the training set
-        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
+        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
         :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
          validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
          indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
@@ -472,11 +472,11 @@ class PACC(AggregativeProbabilisticQuantifier):
         if val_split is None:
             val_split = self.val_split

-        self.learner, y, y_, classes, class_count = cross_generate_predictions(
-            data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs
+        self.classifier, y, y_, classes, class_count = cross_generate_predictions(
+            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
         )

-        self.pcc = PCC(self.learner)
+        self.pcc = PCC(self.classifier)
         self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_)
         return self
@@ -510,7 +510,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
     probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via
     maximum-likelihood estimation, in a mutually recursive way, until convergence.

-    :param learner: a sklearn's Estimator that generates a classifier
+    :param classifier: a sklearn's Estimator that generates a classifier
     :param exact_train_prev: set to True (default) for using, as the initial observation, the true training prevalence;
      or set to False for computing the training prevalence as an estimate, akin to PCC, i.e., as the expected
      value of the posterior probabilities of the training instances as suggested in
@@ -523,30 +523,32 @@ class EMQ(AggregativeProbabilisticQuantifier):
     MAX_ITER = 1000
     EPSILON = 1e-4

-    def __init__(self, learner: BaseEstimator, exact_train_prev=True, recalib=None):
-        self.learner = learner
+    def __init__(self, classifier: BaseEstimator, exact_train_prev=True, recalib=None):
+        self.classifier = classifier
         self.exact_train_prev = exact_train_prev
         self.recalib = recalib

-    def fit(self, data: LabelledCollection, fit_learner=True):
+    def fit(self, data: LabelledCollection, fit_classifier=True):
         if self.recalib is not None:
             if self.recalib == 'nbvs':
-                self.learner = NBVSCalibration(self.learner)
+                self.classifier = NBVSCalibration(self.classifier)
             elif self.recalib == 'bcts':
-                self.learner = BCTSCalibration(self.learner)
+                self.classifier = BCTSCalibration(self.classifier)
             elif self.recalib == 'ts':
-                self.learner = TSCalibration(self.learner)
+                self.classifier = TSCalibration(self.classifier)
             elif self.recalib == 'vs':
-                self.learner = VSCalibration(self.learner)
+                self.classifier = VSCalibration(self.classifier)
+            elif self.recalib == 'platt':
+                self.classifier = CalibratedClassifierCV(self.classifier, ensemble=False)
             else:
                 raise ValueError('invalid param argument for recalibration method; available ones are '
                                  '"nbvs", "bcts", "ts", and "vs".')
-        self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
+        self.classifier, _ = _training_helper(self.classifier, data, fit_classifier, ensure_probabilistic=True)
         if self.exact_train_prev:
             self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
         else:
             self.train_prevalence = qp.model_selection.cross_val_predict(
-                quantifier=PCC(deepcopy(self.learner)),
+                quantifier=PCC(deepcopy(self.classifier)),
                 data=data,
                 nfolds=3,
                 random_state=0
@@ -558,7 +560,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
         return priors

     def predict_proba(self, instances, epsilon=EPSILON):
-        classif_posteriors = self.learner.predict_proba(instances)
+        classif_posteriors = self.classifier.predict_proba(instances)
         priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon)
         return posteriors
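The EM routine invoked above follows the classic Saerens-Latinne-Decaestecker updates; the following is a compact illustrative sketch of those updates (not necessarily QuaPy's exact implementation):

    import numpy as np

    def em_adjust(train_prev, posteriors, epsilon=1e-4, max_iter=1000):
        # iteratively rescale posteriors to a new prior until the prior converges
        qs = np.copy(train_prev)                  # current prior estimate
        for _ in range(max_iter):
            # E-step: reweight posteriors by the ratio of current to training priors
            ps = posteriors * (qs / train_prev)
            ps /= ps.sum(axis=1, keepdims=True)
            # M-step: the new prior is the mean of the adjusted posteriors
            qs_new = ps.mean(axis=0)
            converged = np.abs(qs_new - qs).max() < epsilon
            qs = qs_new
            if converged:
                break
        return qs, ps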
@@ -611,21 +613,21 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     class-conditional distributions of the posterior probabilities returned for the positive and negative validation
     examples, respectively. The parameters of the mixture thus represent the estimates of the class prevalence values.

-    :param learner: a sklearn's Estimator that generates a binary classifier
+    :param classifier: a sklearn's Estimator that generates a binary classifier
     :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
      validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
     """

-    def __init__(self, learner: BaseEstimator, val_split=0.4):
-        self.learner = learner
+    def __init__(self, classifier: BaseEstimator, val_split=0.4):
+        self.classifier = classifier
         self.val_split = val_split

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None):
+    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
         """
         Trains a HDy quantifier.

         :param data: the training set
-        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
+        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
         :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
          validation (e.g., 0.3 for using 30% of the training set as validation data), or a
          :class:`quapy.data.base.LabelledCollection` indicating the validation set itself
@@ -635,11 +637,11 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
             val_split = self.val_split

         self._check_binary(data, self.__class__.__name__)
-        self.learner, validation = _training_helper(
-            self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
+        self.classifier, validation = _training_helper(
+            self.classifier, data, fit_classifier, ensure_probabilistic=True, val_split=val_split)
         Px = self.classify(validation.instances)[:, 1]  # takes only the P(y=+1|x)
-        self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
-        self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
+        self.Pxy1 = Px[validation.labels == self.classifier.classes_[1]]
+        self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
         # pre-compute the histogram for positive and negative examples
         self.bins = np.linspace(10, 110, 11, dtype=int)  # [10, 20, 30, ..., 100, 110]
         self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in
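At quantification time, HDy searches for the mixture parameter that minimizes the Hellinger Distance between the mixed validation histograms and the test histogram. One common formulation of that distance, as an illustrative sketch (the exact normalization may differ from the library's):

    import numpy as np

    def hellinger(p, q):
        # p, q: histogram densities over the same bins; normalize to sum 1 first
        p = p / p.sum()
        q = q / q.sum()
        return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)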
@@ -684,7 +686,7 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     minimizes the distance between distributions.
     Details for the ternary search have been got from <https://dl.acm.org/doi/pdf/10.1145/3219819.3220059>

-    :param learner: a sklearn's Estimator that generates a binary classifier
+    :param classifier: a sklearn's Estimator that generates a binary classifier
     :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
      validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
     :param n_bins: an int with the number of bins to use to compute the histograms.
@@ -693,8 +695,8 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     :param tol: a float with the tolerance for the ternary search algorithm.
     """

-    def __init__(self, learner: BaseEstimator, val_split=0.4, n_bins=8, distance: Union[str, Callable]='HD', tol=1e-05):
-        self.learner = learner
+    def __init__(self, classifier: BaseEstimator, val_split=0.4, n_bins=8, distance: Union[str, Callable]='HD', tol=1e-05):
+        self.classifier = classifier
         self.val_split = val_split
         self.tol = tol
         self.distance = distance
@@ -724,16 +726,16 @@ class DyS(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         else:
             return distance(Px_train, Px_test)

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None):
+    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
         if val_split is None:
             val_split = self.val_split

         self._check_binary(data, self.__class__.__name__)
-        self.learner, validation = _training_helper(
-            self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
+        self.classifier, validation = _training_helper(
+            self.classifier, data, fit_classifier, ensure_probabilistic=True, val_split=val_split)
         Px = self.classify(validation.instances)[:, 1]  # takes only the P(y=+1|x)
-        self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
-        self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
+        self.Pxy1 = Px[validation.labels == self.classifier.classes_[1]]
+        self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
         self.Pxy1_density = np.histogram(self.Pxy1, bins=self.n_bins, range=(0, 1), density=True)[0]
         self.Pxy0_density = np.histogram(self.Pxy0, bins=self.n_bins, range=(0, 1), density=True)[0]
         return self
@ -757,25 +759,25 @@ class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):
SMM is a simplification of matching distribution methods where the representation of the examples SMM is a simplification of matching distribution methods where the representation of the examples
is created using the mean instead of a histogram. is created using the mean instead of a histogram.
:param learner: a sklearn's Estimator that generates a binary classifier. :param classifier: a sklearn's Estimator that generates a binary classifier.
:param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out
validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself). validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself).
""" """
def __init__(self, learner: BaseEstimator, val_split=0.4): def __init__(self, classifier: BaseEstimator, val_split=0.4):
self.learner = learner self.classifier = classifier
self.val_split = val_split self.val_split = val_split
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None): def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
if val_split is None: if val_split is None:
val_split = self.val_split val_split = self.val_split
self._check_binary(data, self.__class__.__name__) self._check_binary(data, self.__class__.__name__)
self.learner, validation = _training_helper( self.classifier, validation = _training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split) self.classifier, data, fit_classifier, ensure_probabilistic=True, val_split=val_split)
Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x) Px = self.classify(validation.instances)[:, 1] # takes only the P(y=+1|x)
self.Pxy1 = Px[validation.labels == self.learner.classes_[1]] self.Pxy1 = Px[validation.labels == self.classifier.classes_[1]]
self.Pxy0 = Px[validation.labels == self.learner.classes_[0]] self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
self.Pxy1_mean = np.mean(self.Pxy1) self.Pxy1_mean = np.mean(self.Pxy1)
self.Pxy0_mean = np.mean(self.Pxy0) self.Pxy0_mean = np.mean(self.Pxy0)
return self return self
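Because SMM reduces each distribution to its scalar mean, the aggregation step admits a closed form; a sketch (the clipping to [0, 1] is an assumption, since the exact handling is not shown in this hunk):

    import numpy as np

    def smm_aggregate(Pxy1_mean, Pxy0_mean, Px_test):
        # solve mean(Px_test) = alpha * Pxy1_mean + (1 - alpha) * Pxy0_mean for alpha
        alpha = (np.mean(Px_test) - Pxy0_mean) / (Pxy1_mean - Pxy0_mean)
        alpha = float(np.clip(alpha, 0, 1))  # estimated prevalence of the positive class
        return np.asarray([1 - alpha, alpha])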
@ -809,19 +811,19 @@ class ELM(AggregativeQuantifier, BinaryQuantifier):
self.svmperf_base = svmperf_base if svmperf_base is not None else qp.environ['SVMPERF_HOME'] self.svmperf_base = svmperf_base if svmperf_base is not None else qp.environ['SVMPERF_HOME']
self.loss = loss self.loss = loss
self.kwargs = kwargs self.kwargs = kwargs
self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs) self.classifier = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs)
def fit(self, data: LabelledCollection, fit_learner=True): def fit(self, data: LabelledCollection, fit_classifier=True):
self._check_binary(data, self.__class__.__name__) self._check_binary(data, self.__class__.__name__)
assert fit_learner, 'the method requires that fit_learner=True' assert fit_classifier, 'the method requires that fit_classifier=True'
self.learner.fit(data.instances, data.labels) self.classifier.fit(data.instances, data.labels)
return self return self
def aggregate(self, classif_predictions: np.ndarray): def aggregate(self, classif_predictions: np.ndarray):
return F.prevalence_from_labels(classif_predictions, self.classes_) return F.prevalence_from_labels(classif_predictions, self.classes_)
def classify(self, X, y=None): def classify(self, X, y=None):
return self.learner.predict(X) return self.classifier.predict(X)
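A hypothetical usage sketch of the ELM family (e.g., SVMQ, defined right below); it assumes the external SVMperf binaries are available at the configured path, and the import path follows QuaPy's aggregative module:

    import quapy as qp
    from quapy.method.aggregative import SVMQ

    qp.environ['SVMPERF_HOME'] = './svm_perf_quantification'  # illustrative path to the SVMperf binaries
    quantifier = SVMQ()                  # ELM subclass using the Q loss
    # quantifier.fit(training)           # training: a binary LabelledCollection, assumed available
    # estim_prev = quantifier.quantify(test_instances)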
class SVMQ(ELM): class SVMQ(ELM):
@ -916,7 +918,7 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
that would allow for more true positives and many more false positives, on the grounds that this that would allow for more true positives and many more false positives, on the grounds that this
would deliver larger denominators. would deliver larger denominators.
:param learner: a scikit-learn Estimator that generates a classifier :param classifier: a scikit-learn Estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated. misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@ -925,22 +927,22 @@ class ThresholdOptimization(AggregativeQuantifier, BinaryQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself). :class:`quapy.data.base.LabelledCollection` (the split itself).
""" """
def __init__(self, learner: BaseEstimator, val_split=0.4, n_jobs=None): def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None):
self.learner = learner self.classifier = classifier
self.val_split = val_split self.val_split = val_split
self.n_jobs = qp.get_njobs(n_jobs) self.n_jobs = qp.get_njobs(n_jobs)
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None): def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, int, LabelledCollection] = None):
self._check_binary(data, "Threshold Optimization") self._check_binary(data, "Threshold Optimization")
if val_split is None: if val_split is None:
val_split = self.val_split val_split = self.val_split
self.learner, y, y_, classes, class_count = cross_generate_predictions( self.classifier, y, y_, classes, class_count = cross_generate_predictions(
data, self.learner, val_split, probabilistic=True, fit_learner=fit_learner, n_jobs=self.n_jobs data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
) )
self.cc = CC(self.learner) self.cc = CC(self.classifier)
self.tpr, self.fpr = self._optimize_threshold(y, y_) self.tpr, self.fpr = self._optimize_threshold(y, y_)
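All threshold-based variants then share the same correction at aggregation time; a sketch of that adjustment (the clipping and the degenerate-case fallback are assumptions):

    import numpy as np

    def adjusted_prevalence(positive_rate, tpr, fpr):
        # correct the observed positive rate with the misclassification rates
        # estimated at the chosen decision threshold
        if tpr == fpr:               # degenerate threshold: the adjustment is undefined
            return positive_rate
        return float(np.clip((positive_rate - fpr) / (tpr - fpr), 0, 1))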
@ -1018,7 +1020,7 @@ class T50(ThresholdOptimization):
for the threshold that makes `tpr` closest to 0.5. for the threshold that makes `tpr` closest to 0.5.
The goal is to bring improved stability to the denominator of the adjustment. The goal is to bring improved stability to the denominator of the adjustment.
:param learner: a scikit-learn Estimator that generates a classifier :param classifier: a scikit-learn Estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated. misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@ -1027,8 +1029,8 @@ class T50(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself). :class:`quapy.data.base.LabelledCollection` (the split itself).
""" """
def __init__(self, learner: BaseEstimator, val_split=0.4): def __init__(self, classifier: BaseEstimator, val_split=0.4):
super().__init__(learner, val_split) super().__init__(classifier, val_split)
def _condition(self, tpr, fpr) -> float: def _condition(self, tpr, fpr) -> float:
return abs(tpr - 0.5) return abs(tpr - 0.5)
@ -1042,7 +1044,7 @@ class MAX(ThresholdOptimization):
for the threshold that maximizes `tpr-fpr`. for the threshold that maximizes `tpr-fpr`.
The goal is to bring improved stability to the denominator of the adjustment. The goal is to bring improved stability to the denominator of the adjustment.
:param learner: a scikit-learn Estimator that generates a classifier :param classifier: a scikit-learn Estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated. misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@ -1051,8 +1053,8 @@ class MAX(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself). :class:`quapy.data.base.LabelledCollection` (the split itself).
""" """
def __init__(self, learner: BaseEstimator, val_split=0.4): def __init__(self, classifier: BaseEstimator, val_split=0.4):
super().__init__(learner, val_split) super().__init__(classifier, val_split)
def _condition(self, tpr, fpr) -> float: def _condition(self, tpr, fpr) -> float:
# MAX strives to maximize (tpr - fpr), which is equivalent to minimize (fpr - tpr) # MAX strives to maximize (tpr - fpr), which is equivalent to minimize (fpr - tpr)
@ -1067,7 +1069,7 @@ class X(ThresholdOptimization):
for the threshold that yields `tpr=1-fpr`. for the threshold that yields `tpr=1-fpr`.
The goal is to bring improved stability to the denominator of the adjustment. The goal is to bring improved stability to the denominator of the adjustment.
:param learner: a scikit-learn Estimator that generates a classifier :param classifier: a scikit-learn Estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated. misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@ -1076,8 +1078,8 @@ class X(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself). :class:`quapy.data.base.LabelledCollection` (the split itself).
""" """
def __init__(self, learner: BaseEstimator, val_split=0.4): def __init__(self, classifier: BaseEstimator, val_split=0.4):
super().__init__(learner, val_split) super().__init__(classifier, val_split)
def _condition(self, tpr, fpr) -> float: def _condition(self, tpr, fpr) -> float:
return abs(1 - (tpr + fpr)) return abs(1 - (tpr + fpr))
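Seen together, T50, MAX and X differ only in how candidate thresholds are scored; a compact sketch of that selection (candidate generation is not shown in these hunks):

    def choose_threshold(candidates, policy):
        # candidates: iterable of (threshold, tpr, fpr) triples estimated on validation data
        conditions = {
            'T50': lambda tpr, fpr: abs(tpr - 0.5),
            'MAX': lambda tpr, fpr: fpr - tpr,            # minimizing fpr-tpr maximizes tpr-fpr
            'X':   lambda tpr, fpr: abs(1 - (tpr + fpr)),
        }
        score = conditions[policy]
        return min(candidates, key=lambda c: score(c[1], c[2]))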
@ -1091,7 +1093,7 @@ class MS(ThresholdOptimization):
class prevalence estimates for all decision thresholds and returns the median of them all. class prevalence estimates for all decision thresholds and returns the median of them all.
The goal is to bring improved stability to the denominator of the adjustment. The goal is to bring improved stability to the denominator of the adjustment.
:param learner: a scikit-learn Estimator that generates a classifier :param classifier: a scikit-learn Estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated. misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@ -1099,8 +1101,8 @@ class MS(ThresholdOptimization):
`k`-fold cross validation (this integer stands for the number of folds `k`), or as a `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself). :class:`quapy.data.base.LabelledCollection` (the split itself).
""" """
def __init__(self, learner: BaseEstimator, val_split=0.4): def __init__(self, classifier: BaseEstimator, val_split=0.4):
super().__init__(learner, val_split) super().__init__(classifier, val_split)
def _condition(self, tpr, fpr) -> float: def _condition(self, tpr, fpr) -> float:
pass pass
@ -1128,7 +1130,7 @@ class MS2(MS):
which `tpr-fpr>0.25` which `tpr-fpr>0.25`
The goal is to bring improved stability to the denominator of the adjustment. The goal is to bring improved stability to the denominator of the adjustment.
:param learner: a scikit-learn Estimator that generates a classifier :param classifier: a scikit-learn Estimator that generates a classifier
:param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the
misclassification rates are to be estimated. misclassification rates are to be estimated.
This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of This parameter can be indicated as a real value (between 0 and 1, default 0.4), representing a proportion of
@ -1136,8 +1138,8 @@ class MS2(MS):
`k`-fold cross validation (this integer stands for the number of folds `k`), or as a `k`-fold cross validation (this integer stands for the number of folds `k`), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself). :class:`quapy.data.base.LabelledCollection` (the split itself).
""" """
def __init__(self, learner: BaseEstimator, val_split=0.4): def __init__(self, classifier: BaseEstimator, val_split=0.4):
super().__init__(learner, val_split) super().__init__(classifier, val_split)
def _optimize_threshold(self, y, probabilities): def _optimize_threshold(self, y, probabilities):
tprs = [0, 1] tprs = [0, 1]
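A sketch of the median-sweep idea behind MS and MS2 (the override above is truncated in this hunk): compute the adjusted estimate at every candidate threshold and return the median, with MS2 additionally discarding thresholds where tpr - fpr <= 0.25; the fallback value is an assumption.

    import numpy as np

    def median_sweep(positive_rates, tprs, fprs, restrict=False):
        # positive_rates[i]: observed positive rate at the i-th threshold
        estimates = []
        for pr, tpr, fpr in zip(positive_rates, tprs, fprs):
            if restrict and (tpr - fpr) <= 0.25:   # the MS2 filter
                continue
            if tpr != fpr:
                estimates.append(np.clip((pr - fpr) / (tpr - fpr), 0, 1))
        if not estimates:
            return 0.5  # fallback when no threshold qualifies (an assumption)
        return float(np.median(estimates))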
@ -1174,7 +1176,8 @@ class OneVsAll(AggregativeQuantifier):
This variant was used, along with the :class:`EMQ` quantifier, in This variant was used, along with the :class:`EMQ` quantifier, in
`Gao and Sebastiani, 2016 <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_. `Gao and Sebastiani, 2016 <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_.
:param learner: a scikit-learn Estimator that generates a binary classifier :param binary_quantifier: a binary quantifier that will be employed to work on multiclass models in a
one-vs-all manner
:param n_jobs: number of parallel workers :param n_jobs: number of parallel workers
""" """
@ -1186,11 +1189,11 @@ class OneVsAll(AggregativeQuantifier):
self.binary_quantifier = binary_quantifier self.binary_quantifier = binary_quantifier
self.n_jobs = qp.get_njobs(n_jobs) self.n_jobs = qp.get_njobs(n_jobs)
def fit(self, data: LabelledCollection, fit_learner=True): def fit(self, data: LabelledCollection, fit_classifier=True):
assert not data.binary, \ assert not data.binary, \
f'{self.__class__.__name__} expects non-binary data' f'{self.__class__.__name__} expects non-binary data'
assert fit_learner == True, \ assert fit_classifier == True, \
'fit_learner must be True' 'fit_classifier must be True'
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_} self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
self.__parallel(self._delayed_binary_fit, data) self.__parallel(self._delayed_binary_fit, data)
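The decomposition itself is easy to sketch outside QuaPy (the fit(X, y)-style interface below is hypothetical; QuaPy quantifiers fit on a LabelledCollection):

    from copy import deepcopy
    import numpy as np

    def ova_fit(binary_quantifier, X, y, classes):
        quantifiers = {}
        for c in classes:
            q = deepcopy(binary_quantifier)     # one independent binary quantifier per class
            q.fit(X, (y == c).astype(int))
            quantifiers[c] = q
        return quantifiers

    def ova_quantify(quantifiers, X, classes):
        # assumes each quantify(X) returns [neg_prev, pos_prev]
        prevs = np.asarray([quantifiers[c].quantify(X)[1] for c in classes])
        return prevs / prevs.sum()              # l1-normalize so prevalences sum to 1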


@ -1,12 +1,15 @@
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from copy import deepcopy from copy import deepcopy
from sklearn.base import BaseEstimator
import quapy as qp import quapy as qp
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
# Base Quantifier abstract class # Base Quantifier abstract class
# ------------------------------------ # ------------------------------------
class BaseQuantifier(metaclass=ABCMeta): class BaseQuantifier(BaseEstimator):
""" """
Abstract Quantifier. A quantifier is defined as an object of a class that implements the method :meth:`fit` on Abstract Quantifier. A quantifier is defined as an object of a class that implements the method :meth:`fit` on
:class:`quapy.data.base.LabelledCollection`, the method :meth:`quantify`, and the :meth:`set_params` and :class:`quapy.data.base.LabelledCollection`, the method :meth:`quantify`, and the :meth:`set_params` and
@ -33,24 +36,24 @@ class BaseQuantifier(metaclass=ABCMeta):
""" """
... ...
@abstractmethod # @abstractmethod
def set_params(self, **parameters): # def set_params(self, **parameters):
""" # """
Set the parameters of the quantifier. # Set the parameters of the quantifier.
#
:param parameters: dictionary of param-value pairs # :param parameters: dictionary of param-value pairs
""" # """
... # ...
#
@abstractmethod # @abstractmethod
def get_params(self, deep=True): # def get_params(self, deep=True):
""" # """
Return the current parameters of the quantifier. # Return the current parameters of the quantifier.
#
:param deep: for compatibility with sklearn # :param deep: for compatibility with sklearn
:return: a dictionary of param-value pairs # :return: a dictionary of param-value pairs
""" # """
... # ...
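The rationale for commenting these out: once BaseQuantifier extends sklearn's BaseEstimator, get_params/set_params are derived automatically from the __init__ signature, and nested hyperparameters become addressable with the double-underscore prefix. A minimal demonstration with a toy class (names illustrative):

    from sklearn.base import BaseEstimator
    from sklearn.linear_model import LogisticRegression

    class ToyQuantifier(BaseEstimator):
        def __init__(self, classifier=None, n_bins=8):
            self.classifier = classifier
            self.n_bins = n_bins

    toy = ToyQuantifier(classifier=LogisticRegression())
    print(toy.get_params(deep=True))              # includes classifier__C, classifier__penalty, ...
    toy.set_params(classifier__C=10, n_bins=16)   # prefixed params reach the inner estimator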
class BinaryQuantifier(BaseQuantifier): class BinaryQuantifier(BaseQuantifier):
@ -67,7 +70,7 @@ class BinaryQuantifier(BaseQuantifier):
class OneVsAllGeneric: class OneVsAllGeneric:
""" """
Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1. quantifier for each class, and then l1-normalizes the outputs so that the class prevalence values sum up to 1.
""" """
def __init__(self, binary_quantifier, n_jobs=None): def __init__(self, binary_quantifier, n_jobs=None):
@ -103,11 +106,11 @@ class OneVsAllGeneric:
def get_params(self, deep=True): def get_params(self, deep=True):
return self.binary_quantifier.get_params() return self.binary_quantifier.get_params()
def _delayed_binary_predict(self, c, learners, X): def _delayed_binary_predict(self, c, quantifiers, X):
return learners[c].quantify(X)[:,1] # the mean is the estimation for the positive class prevalence return quantifiers[c].quantify(X)[:, 1] # the mean is the estimation for the positive class prevalence
def _delayed_binary_fit(self, c, learners, data, **kwargs): def _delayed_binary_fit(self, c, quantifiers, data, **kwargs):
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2) bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
learners[c].fit(bindata, **kwargs) quantifiers[c].fit(bindata, **kwargs)
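The final normalization mentioned in the class docstring amounts to a single division (values illustrative):

    import numpy as np

    prevs = np.asarray([0.3, 0.4, 0.5])  # per-class positive-prevalence estimates
    prevs = prevs / prevs.sum()          # l1 normalization: array([0.25, 0.333..., 0.416...])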


@ -146,7 +146,7 @@ class Ensemble(BaseQuantifier):
This function should not be used within :class:`quapy.model_selection.GridSearchQ` (it is here for compatibility This function should not be used within :class:`quapy.model_selection.GridSearchQ` (it is here for compatibility
with the abstract class). with the abstract class).
Instead, use `Ensemble(GridSearchQ(q),...)`, with `q` a Quantifier (recommended), or Instead, use `Ensemble(GridSearchQ(q),...)`, with `q` a Quantifier (recommended), or
`Ensemble(Q(GridSearchCV(l)))` with `Q` a quantifier class that has a learner `l` optimized for `Ensemble(Q(GridSearchCV(l)))` with `Q` a quantifier class that has a classifier `l` optimized for
classification (not recommended). classification (not recommended).
:param parameters: dictionary :param parameters: dictionary
@ -154,7 +154,7 @@ class Ensemble(BaseQuantifier):
""" """
raise NotImplementedError(f'{self.__class__.__name__} should not be used within GridSearchQ; ' raise NotImplementedError(f'{self.__class__.__name__} should not be used within GridSearchQ; '
f'instead, use Ensemble(GridSearchQ(q),...), with q a Quantifier (recommended), ' f'instead, use Ensemble(GridSearchQ(q),...), with q a Quantifier (recommended), '
f'or Ensemble(Q(GridSearchCV(l))) with Q a quantifier class that has a learner ' f'or Ensemble(Q(GridSearchCV(l))) with Q a quantifier class that has a classifier '
f'l optimized for classification (not recommended).') f'l optimized for classification (not recommended).')
def get_params(self, deep=True): def get_params(self, deep=True):
@ -162,7 +162,7 @@ class Ensemble(BaseQuantifier):
This function should not be used within :class:`quapy.model_selection.GridSearchQ` (it is here for compatibility This function should not be used within :class:`quapy.model_selection.GridSearchQ` (it is here for compatibility
with the abstract class). with the abstract class).
Instead, use `Ensemble(GridSearchQ(q),...)`, with `q` a Quantifier (recommended), or Instead, use `Ensemble(GridSearchQ(q),...)`, with `q` a Quantifier (recommended), or
`Ensemble(Q(GridSearchCV(l)))` with `Q` a quantifier class that has a learner `l` optimized for `Ensemble(Q(GridSearchCV(l)))` with `Q` a quantifier class that has a classifier `l` optimized for
classification (not recommended). classification (not recommended).
:return: raises an Exception :return: raises an Exception
@ -326,18 +326,18 @@ def _draw_simplex(ndim, min_val, max_trials=100):
f'>= {min_val} is unlikely (it failed after {max_trials} trials)') f'>= {min_val} is unlikely (it failed after {max_trials} trials)')
def _instantiate_ensemble(learner, base_quantifier_class, param_grid, optim, param_model_sel, **kwargs): def _instantiate_ensemble(classifier, base_quantifier_class, param_grid, optim, param_model_sel, **kwargs):
if optim is None: if optim is None:
base_quantifier = base_quantifier_class(learner) base_quantifier = base_quantifier_class(classifier)
elif optim in qp.error.CLASSIFICATION_ERROR: elif optim in qp.error.CLASSIFICATION_ERROR:
if optim == qp.error.f1e: if optim == qp.error.f1e:
scoring = make_scorer(f1_score) scoring = make_scorer(f1_score)
elif optim == qp.error.acce: elif optim == qp.error.acce:
scoring = make_scorer(accuracy_score) scoring = make_scorer(accuracy_score)
learner = GridSearchCV(learner, param_grid, scoring=scoring) classifier = GridSearchCV(classifier, param_grid, scoring=scoring)
base_quantifier = base_quantifier_class(learner) base_quantifier = base_quantifier_class(classifier)
else: else:
base_quantifier = GridSearchQ(base_quantifier_class(learner), base_quantifier = GridSearchQ(base_quantifier_class(classifier),
param_grid=param_grid, param_grid=param_grid,
**param_model_sel, **param_model_sel,
error=optim) error=optim)
@ -357,7 +357,7 @@ def _check_error(error):
f'the name of an error function in {qp.error.ERROR_NAMES}') f'the name of an error function in {qp.error.ERROR_NAMES}')
def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, param_model_sel: dict = None, def ensembleFactory(classifier, base_quantifier_class, param_grid=None, optim=None, param_model_sel: dict = None,
**kwargs): **kwargs):
""" """
Ensemble factory. Provides a unified interface for instantiating ensembles that can be optimized (via model Ensemble factory. Provides a unified interface for instantiating ensembles that can be optimized (via model
@ -390,7 +390,7 @@ def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None,
>>> >>>
>>> ensembleFactory(LogisticRegression(), PACC, optim='mae', policy='mae', **common) >>> ensembleFactory(LogisticRegression(), PACC, optim='mae', policy='mae', **common)
:param learner: a scikit-learn Estimator that generates a classifier :param classifier: a scikit-learn Estimator that generates a classifier
:param base_quantifier_class: a class of quantifiers :param base_quantifier_class: a class of quantifiers
:param param_grid: a dictionary with the grid of parameters to optimize for :param param_grid: a dictionary with the grid of parameters to optimize for
:param optim: a valid quantification or classification error, or a string name of it :param optim: a valid quantification or classification error, or a string name of it
@ -405,21 +405,21 @@ def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None,
if param_model_sel is None: if param_model_sel is None:
raise ValueError(f'param_model_sel is None but optim was requested.') raise ValueError(f'param_model_sel is None but optim was requested.')
error = _check_error(optim) error = _check_error(optim)
return _instantiate_ensemble(learner, base_quantifier_class, param_grid, error, param_model_sel, **kwargs) return _instantiate_ensemble(classifier, base_quantifier_class, param_grid, error, param_model_sel, **kwargs)
def ECC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs): def ECC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
""" """
Implements an ensemble of :class:`quapy.method.aggregative.CC` quantifiers, as used by Implements an ensemble of :class:`quapy.method.aggregative.CC` quantifiers, as used by
`Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. `Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
Equivalent to: Equivalent to:
>>> ensembleFactory(learner, CC, param_grid, optim, param_mod_sel, **kwargs) >>> ensembleFactory(classifier, CC, param_grid, optim, param_mod_sel, **kwargs)
See :meth:`ensembleFactory` for further details. See :meth:`ensembleFactory` for further details.
:param learner: a scikit-learn Estimator that generates a classifier :param classifier: a scikit-learn Estimator that generates a classifier
:param param_grid: a dictionary with the grid of parameters to optimize for :param param_grid: a dictionary with the grid of parameters to optimize for
:param optim: a valid quantification or classification error, or a string name of it :param optim: a valid quantification or classification error, or a string name of it
:param param_model_sel: a dictionary containing any keyworded argument to pass to :param param_model_sel: a dictionary containing any keyworded argument to pass to
@ -428,21 +428,21 @@ def ECC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
:return: an instance of :class:`Ensemble` :return: an instance of :class:`Ensemble`
""" """
return ensembleFactory(learner, CC, param_grid, optim, param_mod_sel, **kwargs) return ensembleFactory(classifier, CC, param_grid, optim, param_mod_sel, **kwargs)
def EACC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs): def EACC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
""" """
Implements an ensemble of :class:`quapy.method.aggregative.ACC` quantifiers, as used by Implements an ensemble of :class:`quapy.method.aggregative.ACC` quantifiers, as used by
`Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. `Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
Equivalent to: Equivalent to:
>>> ensembleFactory(learner, ACC, param_grid, optim, param_mod_sel, **kwargs) >>> ensembleFactory(classifier, ACC, param_grid, optim, param_mod_sel, **kwargs)
See :meth:`ensembleFactory` for further details. See :meth:`ensembleFactory` for further details.
:param learner: a scikit-learn Estimator that generates a classifier :param classifier: a scikit-learn Estimator that generates a classifier
:param param_grid: a dictionary with the grid of parameters to optimize for :param param_grid: a dictionary with the grid of parameters to optimize for
:param optim: a valid quantification or classification error, or a string name of it :param optim: a valid quantification or classification error, or a string name of it
:param param_model_sel: a dictionary containing any keyworded argument to pass to :param param_model_sel: a dictionary containing any keyworded argument to pass to
@ -451,20 +451,20 @@ def EACC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
:return: an instance of :class:`Ensemble` :return: an instance of :class:`Ensemble`
""" """
return ensembleFactory(learner, ACC, param_grid, optim, param_mod_sel, **kwargs) return ensembleFactory(classifier, ACC, param_grid, optim, param_mod_sel, **kwargs)
def EPACC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs): def EPACC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
""" """
Implements an ensemble of :class:`quapy.method.aggregative.PACC` quantifiers. Implements an ensemble of :class:`quapy.method.aggregative.PACC` quantifiers.
Equivalent to: Equivalent to:
>>> ensembleFactory(learner, PACC, param_grid, optim, param_mod_sel, **kwargs) >>> ensembleFactory(classifier, PACC, param_grid, optim, param_mod_sel, **kwargs)
See :meth:`ensembleFactory` for further details. See :meth:`ensembleFactory` for further details.
:param learner: a scikit-learn Estimator that generates a classifier :param classifier: a scikit-learn Estimator that generates a classifier
:param param_grid: a dictionary with the grid of parameters to optimize for :param param_grid: a dictionary with the grid of parameters to optimize for
:param optim: a valid quantification or classification error, or a string name of it :param optim: a valid quantification or classification error, or a string name of it
:param param_model_sel: a dictionary containing any keyworded argument to pass to :param param_model_sel: a dictionary containing any keyworded argument to pass to
@ -473,21 +473,21 @@ def EPACC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
:return: an instance of :class:`Ensemble` :return: an instance of :class:`Ensemble`
""" """
return ensembleFactory(learner, PACC, param_grid, optim, param_mod_sel, **kwargs) return ensembleFactory(classifier, PACC, param_grid, optim, param_mod_sel, **kwargs)
def EHDy(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs): def EHDy(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
""" """
Implements an ensemble of :class:`quapy.method.aggregative.HDy` quantifiers, as used by Implements an ensemble of :class:`quapy.method.aggregative.HDy` quantifiers, as used by
`Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. `Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
Equivalent to: Equivalent to:
>>> ensembleFactory(learner, HDy, param_grid, optim, param_mod_sel, **kwargs) >>> ensembleFactory(classifier, HDy, param_grid, optim, param_mod_sel, **kwargs)
See :meth:`ensembleFactory` for further details. See :meth:`ensembleFactory` for further details.
:param learner: a scikit-learn Estimator that generates a classifier :param classifier: a scikit-learn Estimator that generates a classifier
:param param_grid: a dictionary with the grid of parameters to optimize for :param param_grid: a dictionary with the grid of parameters to optimize for
:param optim: a valid quantification or classification error, or a string name of it :param optim: a valid quantification or classification error, or a string name of it
:param param_model_sel: a dictionary containing any keyworded argument to pass to :param param_model_sel: a dictionary containing any keyworded argument to pass to
@ -496,20 +496,20 @@ def EHDy(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
:return: an instance of :class:`Ensemble` :return: an instance of :class:`Ensemble`
""" """
return ensembleFactory(learner, HDy, param_grid, optim, param_mod_sel, **kwargs) return ensembleFactory(classifier, HDy, param_grid, optim, param_mod_sel, **kwargs)
def EEMQ(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs): def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
""" """
Implements an ensemble of :class:`quapy.method.aggregative.EMQ` quantifiers. Implements an ensemble of :class:`quapy.method.aggregative.EMQ` quantifiers.
Equivalent to: Equivalent to:
>>> ensembleFactory(learner, EMQ, param_grid, optim, param_mod_sel, **kwargs) >>> ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)
See :meth:`ensembleFactory` for further details. See :meth:`ensembleFactory` for further details.
:param learner: a scikit-learn Estimator that generates a classifier :param classifier: a scikit-learn Estimator that generates a classifier
:param param_grid: a dictionary with the grid of parameters to optimize for :param param_grid: a dictionary with the grid of parameters to optimize for
:param optim: a valid quantification or classification error, or a string name of it :param optim: a valid quantification or classification error, or a string name of it
:param param_model_sel: a dictionary containing any keyworded argument to pass to :param param_model_sel: a dictionary containing any keyworded argument to pass to
@ -518,4 +518,4 @@ def EEMQ(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
:return: an instance of :class:`Ensemble` :return: an instance of :class:`Ensemble`
""" """
return ensembleFactory(learner, EMQ, param_grid, optim, param_mod_sel, **kwargs) return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)
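A hypothetical usage sketch of these factories (module path, grid values and Ensemble keyword arguments are assumptions; note the classifier__ prefix used for classifier hyperparameters under quantification-oriented optimization):

    from sklearn.linear_model import LogisticRegression
    from quapy.method.meta import ECC

    # no optimization: the classifier is wrapped as-is into an ensemble of CC quantifiers
    ensemble = ECC(LogisticRegression(), size=30, policy='ave')  # size/policy forwarded to Ensemble (assumed)
    # with quantification-error optimization the grid would use prefixed names, e.g.:
    # ECC(LogisticRegression(), param_grid={'classifier__C': [0.01, 1, 100]}, optim='mae',
    #     param_mod_sel=dict(...))  # model-selection arguments assumed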


@ -31,14 +31,14 @@ class QuaNetTrainer(BaseQuantifier):
>>> >>>
>>> # the text classifier is a CNN trained by NeuralClassifierTrainer >>> # the text classifier is a CNN trained by NeuralClassifierTrainer
>>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes) >>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
>>> learner = NeuralClassifierTrainer(cnn, device='cuda') >>> classifier = NeuralClassifierTrainer(cnn, device='cuda')
>>> >>>
>>> # train QuaNet (QuaNet is an alias to QuaNetTrainer) >>> # train QuaNet (QuaNet is an alias to QuaNetTrainer)
>>> model = QuaNet(learner, qp.environ['SAMPLE_SIZE'], device='cuda') >>> model = QuaNet(classifier, qp.environ['SAMPLE_SIZE'], device='cuda')
>>> model.fit(dataset.training) >>> model.fit(dataset.training)
>>> estim_prevalence = model.quantify(dataset.test.instances) >>> estim_prevalence = model.quantify(dataset.test.instances)
:param learner: an object implementing `fit` (i.e., that can be trained on labelled data), :param classifier: an object implementing `fit` (i.e., that can be trained on labelled data),
`predict_proba` (i.e., that can generate posterior probabilities of unlabelled examples) and `predict_proba` (i.e., that can generate posterior probabilities of unlabelled examples) and
`transform` (i.e., that can generate embedded representations of the unlabelled instances). `transform` (i.e., that can generate embedded representations of the unlabelled instances).
:param sample_size: integer, the sample size :param sample_size: integer, the sample size
@ -60,7 +60,7 @@ class QuaNetTrainer(BaseQuantifier):
""" """
def __init__(self, def __init__(self,
learner, classifier,
sample_size, sample_size,
n_epochs=100, n_epochs=100,
tr_iter_per_poch=500, tr_iter_per_poch=500,
@ -76,13 +76,13 @@ class QuaNetTrainer(BaseQuantifier):
checkpointname=None, checkpointname=None,
device='cuda'): device='cuda'):
assert hasattr(learner, 'transform'), \ assert hasattr(classifier, 'transform'), \
f'the learner {learner.__class__.__name__} does not seem to be able to produce document embeddings ' \ f'the classifier {classifier.__class__.__name__} does not seem to be able to produce document embeddings ' \
f'since it does not implement the method "transform"' f'since it does not implement the method "transform"'
assert hasattr(learner, 'predict_proba'), \ assert hasattr(classifier, 'predict_proba'), \
f'the learner {learner.__class__.__name__} does not seem to be able to produce posterior probabilities ' \ f'the classifier {classifier.__class__.__name__} does not seem to be able to produce posterior probabilities ' \
f'since it does not implement the method "predict_proba"' f'since it does not implement the method "predict_proba"'
self.learner = learner self.classifier = classifier
self.sample_size = sample_size self.sample_size = sample_size
self.n_epochs = n_epochs self.n_epochs = n_epochs
self.tr_iter = tr_iter_per_poch self.tr_iter = tr_iter_per_poch
@ -105,26 +105,26 @@ class QuaNetTrainer(BaseQuantifier):
self.checkpoint = os.path.join(checkpointdir, checkpointname) self.checkpoint = os.path.join(checkpointdir, checkpointname)
self.device = torch.device(device) self.device = torch.device(device)
self.__check_params_colision(self.quanet_params, self.learner.get_params()) self.__check_params_colision(self.quanet_params, self.classifier.get_params())
self._classes_ = None self._classes_ = None
def fit(self, data: LabelledCollection, fit_learner=True): def fit(self, data: LabelledCollection, fit_classifier=True):
""" """
Trains QuaNet. Trains QuaNet.
:param data: the training data on which to train QuaNet. If `fit_learner=True`, the data will be split in :param data: the training data on which to train QuaNet. If `fit_classifier=True`, the data will be split in
40/40/20 for training the classifier, training QuaNet, and validating QuaNet, respectively. If 40/40/20 for training the classifier, training QuaNet, and validating QuaNet, respectively. If
`fit_learner=False`, the data will be split in 66/34 for training QuaNet and validating it, respectively. `fit_classifier=False`, the data will be split in 66/34 for training QuaNet and validating it, respectively.
:param fit_learner: if True, trains the classifier on a split containing 40% of the data :param fit_classifier: if True, trains the classifier on a split containing 40% of the data
:return: self :return: self
""" """
self._classes_ = data.classes_ self._classes_ = data.classes_
os.makedirs(self.checkpointdir, exist_ok=True) os.makedirs(self.checkpointdir, exist_ok=True)
if fit_learner: if fit_classifier:
classifier_data, unused_data = data.split_stratified(0.4) classifier_data, unused_data = data.split_stratified(0.4)
train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20% train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20%
self.learner.fit(*classifier_data.Xy) self.classifier.fit(*classifier_data.Xy)
else: else:
classifier_data = None classifier_data = None
train_data, valid_data = data.split_stratified(0.66) train_data, valid_data = data.split_stratified(0.66)
@ -133,21 +133,21 @@ class QuaNetTrainer(BaseQuantifier):
self.tr_prev = data.prevalence() self.tr_prev = data.prevalence()
# compute the posterior probabilities of the instances # compute the posterior probabilities of the instances
valid_posteriors = self.learner.predict_proba(valid_data.instances) valid_posteriors = self.classifier.predict_proba(valid_data.instances)
train_posteriors = self.learner.predict_proba(train_data.instances) train_posteriors = self.classifier.predict_proba(train_data.instances)
# turn instances' original representations into embeddings # turn instances' original representations into embeddings
valid_data_embed = LabelledCollection(self.learner.transform(valid_data.instances), valid_data.labels, self._classes_) valid_data_embed = LabelledCollection(self.classifier.transform(valid_data.instances), valid_data.labels, self._classes_)
train_data_embed = LabelledCollection(self.learner.transform(train_data.instances), train_data.labels, self._classes_) train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_)
self.quantifiers = { self.quantifiers = {
'cc': CC(self.learner).fit(None, fit_learner=False), 'cc': CC(self.classifier).fit(None, fit_classifier=False),
'acc': ACC(self.learner).fit(None, fit_learner=False, val_split=valid_data), 'acc': ACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
'pcc': PCC(self.learner).fit(None, fit_learner=False), 'pcc': PCC(self.classifier).fit(None, fit_classifier=False),
'pacc': PACC(self.learner).fit(None, fit_learner=False, val_split=valid_data), 'pacc': PACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data),
} }
if classifier_data is not None: if classifier_data is not None:
self.quantifiers['emq'] = EMQ(self.learner).fit(classifier_data, fit_learner=False) self.quantifiers['emq'] = EMQ(self.classifier).fit(classifier_data, fit_classifier=False)
self.status = { self.status = {
'tr-loss': -1, 'tr-loss': -1,
@ -199,8 +199,8 @@ class QuaNetTrainer(BaseQuantifier):
return prevs_estim return prevs_estim
def quantify(self, instances): def quantify(self, instances):
posteriors = self.learner.predict_proba(instances) posteriors = self.classifier.predict_proba(instances)
embeddings = self.learner.transform(instances) embeddings = self.classifier.transform(instances)
quant_estims = self._get_aggregative_estims(posteriors) quant_estims = self._get_aggregative_estims(posteriors)
self.quanet.eval() self.quanet.eval()
with torch.no_grad(): with torch.no_grad():
@ -264,7 +264,7 @@ class QuaNetTrainer(BaseQuantifier):
f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}') f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}')
def get_params(self, deep=True): def get_params(self, deep=True):
return {**self.learner.get_params(), **self.quanet_params} return {**self.classifier.get_params(), **self.quanet_params}
def set_params(self, **parameters): def set_params(self, **parameters):
learner_params = {} learner_params = {}
@ -273,7 +273,7 @@ class QuaNetTrainer(BaseQuantifier):
self.quanet_params[key] = val self.quanet_params[key] = val
else: else:
learner_params[key] = val learner_params[key] = val
self.learner.set_params(**learner_params) self.classifier.set_params(**learner_params)
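The routing above sends every non-QuaNet key to the classifier; a hypothetical sketch of the prefix-based alternative used elsewhere in this commit (names illustrative, not the code shipped here):

    def set_params_prefixed(self, **parameters):
        # route classifier__-prefixed keys to the classifier, known keys to QuaNet
        classifier_params = {}
        for key, val in parameters.items():
            if key.startswith('classifier__'):
                classifier_params[key[len('classifier__'):]] = val
            elif key in self.quanet_params:
                self.quanet_params[key] = val
            else:
                raise ValueError(f'unexpected parameter {key}')
        if classifier_params:
            self.classifier.set_params(**classifier_params)
        return self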
def __check_params_colision(self, quanet_params, learner_params): def __check_params_colision(self, quanet_params, learner_params):
quanet_keys = set(quanet_params.keys()) quanet_keys = set(quanet_params.keys())
@ -281,7 +281,7 @@ class QuaNetTrainer(BaseQuantifier):
intersection = quanet_keys.intersection(learner_keys) intersection = quanet_keys.intersection(learner_keys)
if len(intersection) > 0: if len(intersection) > 0:
raise ValueError(f'the use of parameters {intersection} is ambiguous since those can refer to ' raise ValueError(f'the use of parameters {intersection} is ambiguous since those can refer to '
f'the parameters of QuaNet or the learner {self.learner.__class__.__name__}') f'the parameters of QuaNet or the learner {self.classifier.__class__.__name__}')
def clean_checkpoint(self): def clean_checkpoint(self):
""" """


@ -88,7 +88,12 @@ class GridSearchQ(BaseQuantifier):
hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)] hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)]
# pass a seed to parallel so it is set in child processes # pass a seed to parallel so it is set in child processes
scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), seed=qp.environ.get('_R_SEED', None), n_jobs=self.n_jobs) scores = qp.util.parallel(
self._delayed_eval,
((params, training) for params in hyper),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs
)
for params, score, model in scores: for params, score, model in scores:
if score is not None: if score is not None:
@ -103,7 +108,7 @@ class GridSearchQ(BaseQuantifier):
tend = time()-tinit tend = time()-tinit
if self.best_score_ is None: if self.best_score_ is None:
raise TimeoutError('all jobs took more than the timeout time to end') raise TimeoutError('no combination of hyperparameters seems to work')
self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) ' self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
f'[took {tend:.4f}s]') f'[took {tend:.4f}s]')
@ -150,6 +155,13 @@ class GridSearchQ(BaseQuantifier):
except TimeoutError: except TimeoutError:
self._sout(f'timeout ({self.timeout}s) reached for config {params}') self._sout(f'timeout ({self.timeout}s) reached for config {params}')
score = None score = None
except ValueError as e:
self._sout(f'the combination of hyperparameters {params} is invalid')
raise e
except Exception as e:
self._sout(f'something went wrong for config {params}; skipping:')
self._sout(f'\tException: {e}')
score = None
return params, score, model return params, score, model
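For reference, the grid expansion that feeds the parallel evaluation above is the standard dict-of-lists to list-of-dicts idiom:

    import itertools

    param_grid = {'classifier__C': [0.1, 1, 10], 'classifier__class_weight': ['balanced', None]}
    keys = list(param_grid.keys())
    hyper = [dict(zip(keys, values)) for values in itertools.product(*param_grid.values())]
    print(len(hyper))   # 6 combinations
    print(hyper[0])     # {'classifier__C': 0.1, 'classifier__class_weight': 'balanced'}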