Refactoring aggregative methods as methods that implement not only 'classify' and 'quantify' but also 'aggregate', and that, by default, implement 'quantify' as a pipeline of 'classify' followed by 'aggregate'. This helps speed up evaluations A LOT, since the documents can be pre-classified once and the samples are then drawn from the pre-classified values (labels or posterior probabilities); thus only 'aggregate' is called many times within the artificial sampling protocol.
This commit is contained in:
parent
e55caf82fd
commit
c8a1a70c8a
2
TODO.txt
2
TODO.txt
|
@ -7,5 +7,5 @@ Add readers for typical datasets used in Quantification
|
||||||
Add NAE, NRAE
|
Add NAE, NRAE
|
||||||
Add "measures for evaluating ordinal"?
|
Add "measures for evaluating ordinal"?
|
||||||
Document methods with paper references
|
Document methods with paper references
|
||||||
The parallel training in svmperf seems not to work
|
The parallel training in svmperf seems not to work (not sure...)
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from data import LabelledCollection
|
from data import LabelledCollection
|
||||||
|
from quapy.method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier
|
||||||
from method.base import BaseQuantifier
|
from method.base import BaseQuantifier
|
||||||
from utils.util import temp_seed
|
from utils.util import temp_seed
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -10,8 +11,8 @@ def artificial_sampling_prediction(
|
||||||
model: BaseQuantifier,
|
model: BaseQuantifier,
|
||||||
test: LabelledCollection,
|
test: LabelledCollection,
|
||||||
sample_size,
|
sample_size,
|
||||||
prevalence_points=21,
|
n_prevpoints=210,
|
||||||
point_repetitions=1,
|
n_repetitions=1,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
random_seed=42):
|
random_seed=42):
|
||||||
"""
|
"""
|
||||||
|
@ -19,27 +20,40 @@ def artificial_sampling_prediction(
|
||||||
:param model: the model in charge of generating the class prevalence estimations
|
:param model: the model in charge of generating the class prevalence estimations
|
||||||
:param test: the test set on which to perform arificial sampling
|
:param test: the test set on which to perform arificial sampling
|
||||||
:param sample_size: the size of the samples
|
:param sample_size: the size of the samples
|
||||||
:param prevalence_points: the number of different prevalences to sample
|
:param n_prevpoints: the number of different prevalences to sample
|
||||||
:param point_repetitions: the number of repetitions for each prevalence
|
:param n_repetitions: the number of repetitions for each prevalence
|
||||||
:param n_jobs: number of jobs to be run in parallel
|
:param n_jobs: number of jobs to be run in parallel
|
||||||
:param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
|
:param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
|
||||||
any other random process.
|
any other random process.
|
||||||
:return: two ndarrays of [m,n] with m the number of samples (prevalence_points*point_repetitions) and n the
|
:return: two ndarrays of [m,n] with m the number of samples (n_prevpoints*n_repetitions) and n the
|
||||||
number of classes. The first one contains the true prevalences for the samples generated while the second one
|
number of classes. The first one contains the true prevalences for the samples generated while the second one
|
||||||
containing the the prevalences estimations
|
containing the the prevalences estimations
|
||||||
"""
|
"""
|
||||||
|
|
||||||
with temp_seed(random_seed):
|
with temp_seed(random_seed):
|
||||||
indexes = list(test.artificial_sampling_index_generator(sample_size, prevalence_points, point_repetitions))
|
indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions))
|
||||||
|
|
||||||
|
if isinstance(model, AggregativeQuantifier):
|
||||||
|
quantification_func = model.aggregate
|
||||||
|
if isinstance(model, AggregativeProbabilisticQuantifier):
|
||||||
|
print('\tpreclassifying with soft')
|
||||||
|
preclassified_instances = model.posterior_probabilities(test.instances)
|
||||||
|
else:
|
||||||
|
print('\tpreclassifying with hard')
|
||||||
|
preclassified_instances = model.classify(test.instances)
|
||||||
|
test = LabelledCollection(preclassified_instances, test.labels)
|
||||||
|
else:
|
||||||
|
quantification_func = model.quantify
|
||||||
|
print('not an aggregative')
|
||||||
|
|
||||||
def _predict_prevalences(index):
|
def _predict_prevalences(index):
|
||||||
sample = test.sampling_from_index(index)
|
sample = test.sampling_from_index(index)
|
||||||
true_prevalence = sample.prevalence()
|
true_prevalence = sample.prevalence()
|
||||||
estim_prevalence = model.quantify(sample.instances)
|
estim_prevalence = quantification_func(sample.instances)
|
||||||
return true_prevalence, estim_prevalence
|
return true_prevalence, estim_prevalence
|
||||||
|
|
||||||
results = Parallel(n_jobs=n_jobs)(
|
results = Parallel(n_jobs=n_jobs)(
|
||||||
delayed(_predict_prevalences)(index) for index in tqdm(indexes)
|
delayed(_predict_prevalences)(index) for index in tqdm(indexes, desc='[artificial sampling protocol] predicting')
|
||||||
)
|
)
|
||||||
|
|
||||||
true_prevalences, estim_prevalences = zip(*results)
|
true_prevalences, estim_prevalences = zip(*results)
|
||||||
|
|
|
@ -36,6 +36,8 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
|
||||||
|
|
||||||
|
|
||||||
def prevalence_from_labels(labels, n_classes):
|
def prevalence_from_labels(labels, n_classes):
|
||||||
|
if labels.ndim != 1:
|
||||||
|
raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
|
||||||
unique, counts = np.unique(labels, return_counts=True)
|
unique, counts = np.unique(labels, return_counts=True)
|
||||||
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
|
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
|
||||||
prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float)
|
prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float)
|
||||||
|
@ -44,6 +46,8 @@ def prevalence_from_labels(labels, n_classes):
|
||||||
|
|
||||||
|
|
||||||
def prevalence_from_probabilities(posteriors, binarize: bool = False):
|
def prevalence_from_probabilities(posteriors, binarize: bool = False):
|
||||||
|
if posteriors.ndim != 2:
|
||||||
|
raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities')
|
||||||
if binarize:
|
if binarize:
|
||||||
predictions = np.argmax(posteriors, axis=-1)
|
predictions = np.argmax(posteriors, axis=-1)
|
||||||
return prevalence_from_labels(predictions, n_classes=posteriors.shape[1])
|
return prevalence_from_labels(predictions, n_classes=posteriors.shape[1])
|
||||||
|
@ -78,15 +82,15 @@ def normalize_prevalence(prevalences):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
|
def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1):
|
||||||
"""
|
"""
|
||||||
Computes the number of prevalence combinations in the nclasses-dimensional simplex if nprevpoints equally distant
|
Computes the number of prevalence combinations in the n_classes-dimensional simplex if nprevpoints equally distant
|
||||||
prevalences are generated and nrepeats repetitions are requested
|
prevalences are generated and n_repeats repetitions are requested
|
||||||
:param nclasses: number of classes
|
:param n_classes: number of classes
|
||||||
:param nprevpoints: number of prevalence points.
|
:param n_prevpoints: number of prevalence points.
|
||||||
:param nrepeats: number of repetitions for each prevalence combination
|
:param n_repeats: number of repetitions for each prevalence combination
|
||||||
:return: The number of possible combinations. For example, if nclasses=2, nprevpoints=5, nrepeats=1, then the number
|
:return: The number of possible combinations. For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the
|
||||||
of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
|
number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
|
||||||
"""
|
"""
|
||||||
__cache={}
|
__cache={}
|
||||||
def __f(nc,np):
|
def __f(nc,np):
|
||||||
|
@ -98,25 +102,25 @@ def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
|
||||||
x = sum([__f(nc-1, np-i) for i in range(np)])
|
x = sum([__f(nc-1, np-i) for i in range(np)])
|
||||||
__cache[(nc,np)] = x
|
__cache[(nc,np)] = x
|
||||||
return x
|
return x
|
||||||
return __f(nclasses, nprevpoints) * nrepeats
|
return __f(n_classes, n_prevpoints) * n_repeats
|
||||||
|
|
||||||
|
|
||||||
def get_nprevpoints_approximation(nclasses, nrepeats, combinations_budget):
|
def get_nprevpoints_approximation(combinations_budget:int, n_classes:int, n_repeats:int=1):
|
||||||
"""
|
"""
|
||||||
Searches for the largest number of (equidistant) prevalence points to define for each of the nclasses classe so that
|
Searches for the largest number of (equidistant) prevalence points to define for each of the n_classes classes so that
|
||||||
the number of valid prevalences generated as combinations of prevalence points (points in a nclasses-dimensional
|
the number of valid prevalences generated as combinations of prevalence points (points in a n_classes-dimensional
|
||||||
simplex) do not exceed combinations_budget.
|
simplex) do not exceed combinations_budget.
|
||||||
:param nclasses: number of classes
|
:param n_classes: number of classes
|
||||||
:param nrepeats: number of repetitions for each prevalence combination
|
:param n_repeats: number of repetitions for each prevalence combination
|
||||||
:param combinations_budget: maximum number of combinatios allowed
|
:param combinations_budget: maximum number of combinatios allowed
|
||||||
:return: the largest number of prevalence points that generate less than combinations_budget valid prevalences
|
:return: the largest number of prevalence points that generate less than combinations_budget valid prevalences
|
||||||
"""
|
"""
|
||||||
assert nclasses>0 and nrepeats>0 and combinations_budget>0, 'parameters must be positive integers'
|
assert n_classes > 0 and n_repeats > 0 and combinations_budget > 0, 'parameters must be positive integers'
|
||||||
nprevpoints = 1
|
n_prevpoints = 1
|
||||||
while True:
|
while True:
|
||||||
combinations = num_prevalence_combinations(nclasses, nprevpoints, nrepeats)
|
combinations = num_prevalence_combinations(n_prevpoints, n_classes, n_repeats)
|
||||||
if combinations > combinations_budget:
|
if combinations > combinations_budget:
|
||||||
return nprevpoints-1
|
return n_prevpoints-1
|
||||||
else:
|
else:
|
||||||
nprevpoints+=1
|
n_prevpoints += 1
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ AGGREGATIVE_METHODS = {
|
||||||
agg.AdjustedClassifyAndCount,
|
agg.AdjustedClassifyAndCount,
|
||||||
agg.ProbabilisticClassifyAndCount,
|
agg.ProbabilisticClassifyAndCount,
|
||||||
agg.ProbabilisticAdjustedClassifyAndCount,
|
agg.ProbabilisticAdjustedClassifyAndCount,
|
||||||
agg.ExplicitLossMinimisation,
|
agg.ExplicitLossMinimisationBinary,
|
||||||
agg.ExpectationMaximizationQuantifier,
|
agg.ExpectationMaximizationQuantifier,
|
||||||
agg.HellingerDistanceY
|
agg.HellingerDistanceY
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,6 +34,13 @@ class AggregativeQuantifier(BaseQuantifier):
|
||||||
def classify(self, instances):
|
def classify(self, instances):
|
||||||
return self.learner.predict(instances)
|
return self.learner.predict(instances)
|
||||||
|
|
||||||
|
def quantify(self, instances, *args):
|
||||||
|
classif_predictions = self.classify(instances)
|
||||||
|
return self.aggregate(classif_predictions, *args)
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def aggregate(self, classif_predictions:np.ndarray, *args): ...
|
||||||
|
|
||||||
def get_params(self, deep=True):
|
def get_params(self, deep=True):
|
||||||
return self.learner.get_params()
|
return self.learner.get_params()
|
||||||
|
|
||||||
|
@ -53,13 +60,17 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
|
||||||
"""
|
"""
|
||||||
Abstract class for quantification methods that base their estimations on the aggregation of posterior probabilities
|
Abstract class for quantification methods that base their estimations on the aggregation of posterior probabilities
|
||||||
as returned by a probabilistic classifier. Aggregative Probabilistic Quantifiers thus extend Aggregative
|
as returned by a probabilistic classifier. Aggregative Probabilistic Quantifiers thus extend Aggregative
|
||||||
Quantifiersimplement by implementing a _soft_classify_ method returning values in [0,1] -- the posterior
|
Quantifiersimplement by implementing a _posterior_probabilities_ method returning values in [0,1] -- the posterior
|
||||||
probabilities.
|
probabilities.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def soft_classify(self, data):
|
def posterior_probabilities(self, data):
|
||||||
return self.learner.predict_proba(data)
|
return self.learner.predict_proba(data)
|
||||||
|
|
||||||
|
def quantify(self, instances, *args):
|
||||||
|
classif_posteriors = self.posterior_probabilities(instances)
|
||||||
|
return self.aggregate(classif_posteriors, *args)
|
||||||
|
|
||||||
def set_params(self, **parameters):
|
def set_params(self, **parameters):
|
||||||
if isinstance(self.learner, CalibratedClassifierCV):
|
if isinstance(self.learner, CalibratedClassifierCV):
|
||||||
parameters={'base_estimator__'+k:v for k,v in parameters.items()}
|
parameters={'base_estimator__'+k:v for k,v in parameters.items()}
|
||||||
|
@ -128,9 +139,8 @@ class ClassifyAndCount(AggregativeQuantifier):
|
||||||
self.learner, _ = training_helper(self.learner, data, fit_learner)
|
self.learner, _ = training_helper(self.learner, data, fit_learner)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, instances, *args):
|
def aggregate(self, classif_predictions, *args):
|
||||||
classification = self.classify(instances) # classify
|
return F.prevalence_from_labels(classif_predictions, self.n_classes)
|
||||||
return F.prevalence_from_labels(classification, self.n_classes) # & count
|
|
||||||
|
|
||||||
|
|
||||||
class AdjustedClassifyAndCount(AggregativeQuantifier):
|
class AdjustedClassifyAndCount(AggregativeQuantifier):
|
||||||
|
@ -141,17 +151,24 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
|
def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
|
||||||
self.learner, validation = training_helper(self.learner, data, fit_learner, train_val_split=train_val_split)
|
self.learner, validation = training_helper(self.learner, data, fit_learner, train_val_split=train_val_split)
|
||||||
self.cc = ClassifyAndCount(self.learner)
|
self.cc = ClassifyAndCount(self.learner)
|
||||||
y_ = self.cc.classify(validation.instances)
|
y_ = self.classify(validation.instances)
|
||||||
y = validation.labels
|
y = validation.labels
|
||||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||||
# document that belongs to yj ends up being classified as belonging to yi
|
# document that belongs to yj ends up being classified as belonging to yi
|
||||||
self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
|
self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, instances, *args):
|
def classify(self, data):
|
||||||
prevs_estim = self.cc.quantify(instances)
|
return self.cc.classify(data)
|
||||||
# solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim
|
|
||||||
A = self.Pte_cond_estim_
|
def aggregate(self, classif_predictions, *args):
|
||||||
|
prevs_estim = self.cc.aggregate(classif_predictions)
|
||||||
|
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def solve_adjustment(cls, PteCondEstim, prevs_estim):
|
||||||
|
# solve for the linear system Ax = B with A=PteCondEstim and B = prevs_estim
|
||||||
|
A = PteCondEstim
|
||||||
B = prevs_estim
|
B = prevs_estim
|
||||||
try:
|
try:
|
||||||
adjusted_prevs = np.linalg.solve(A, B)
|
adjusted_prevs = np.linalg.solve(A, B)
|
||||||
|
@ -161,9 +178,6 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
|
||||||
adjusted_prevs = prevs_estim # no way to adjust them!
|
adjusted_prevs = prevs_estim # no way to adjust them!
|
||||||
return adjusted_prevs
|
return adjusted_prevs
|
||||||
|
|
||||||
def classify(self, data):
|
|
||||||
return self.cc.classify(data)
|
|
||||||
|
|
||||||
|
|
||||||
class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
|
class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
|
||||||
def __init__(self, learner):
|
def __init__(self, learner):
|
||||||
|
@ -173,13 +187,11 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
|
||||||
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, instances, *args):
|
def aggregate(self, classif_posteriors, *args):
|
||||||
posteriors = self.soft_classify(instances) # classify
|
return F.prevalence_from_probabilities(classif_posteriors, binarize=False)
|
||||||
prevalences = F.prevalence_from_probabilities(posteriors, binarize=False) # & count
|
|
||||||
return prevalences
|
|
||||||
|
|
||||||
|
|
||||||
class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
|
class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
def __init__(self, learner):
|
def __init__(self, learner):
|
||||||
self.learner = learner
|
self.learner = learner
|
||||||
|
@ -189,28 +201,23 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
|
||||||
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split
|
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split
|
||||||
)
|
)
|
||||||
self.pcc = ProbabilisticClassifyAndCount(self.learner)
|
self.pcc = ProbabilisticClassifyAndCount(self.learner)
|
||||||
y_ = self.pcc.classify(validation.instances)
|
y_ = self.classify(validation.instances)
|
||||||
y = validation.labels
|
y = validation.labels
|
||||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||||
# document that belongs to yj ends up being classified as belonging to yi
|
# document that belongs to yj ends up being classified as belonging to yi
|
||||||
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
|
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, instances, *args):
|
def aggregate(self, classif_posteriors, *args):
|
||||||
prevs_estim = self.pcc.quantify(instances)
|
prevs_estim = self.pcc.aggregate(classif_posteriors)
|
||||||
A = self.Pte_cond_estim_
|
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
|
||||||
B = prevs_estim
|
|
||||||
try:
|
|
||||||
adjusted_prevs = np.linalg.solve(A, B)
|
|
||||||
adjusted_prevs = np.clip(adjusted_prevs, 0, 1)
|
|
||||||
adjusted_prevs /= adjusted_prevs.sum()
|
|
||||||
except np.linalg.LinAlgError:
|
|
||||||
adjusted_prevs = prevs_estim # no way to adjust them!
|
|
||||||
return adjusted_prevs
|
|
||||||
|
|
||||||
def classify(self, data):
|
def classify(self, data):
|
||||||
return self.pcc.classify(data)
|
return self.pcc.classify(data)
|
||||||
|
|
||||||
|
def soft_classify(self, data):
|
||||||
|
return self.pcc.posterior_probabilities(data)
|
||||||
|
|
||||||
|
|
||||||
class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
|
class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
|
@ -226,10 +233,8 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
|
||||||
self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes)
|
self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, X, epsilon=EPSILON):
|
def aggregate(self, classif_posteriors, epsilon=EPSILON):
|
||||||
tr_prev=self.train_prevalence
|
return self.EM(self.train_prevalence, classif_posteriors, self.verbose, epsilon)
|
||||||
posteriors = self.soft_classify(X)
|
|
||||||
return self.EM(tr_prev, posteriors, self.verbose, epsilon)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def EM(cls, tr_prev, posterior_probabilities, verbose=False, epsilon=EPSILON):
|
def EM(cls, tr_prev, posterior_probabilities, verbose=False, epsilon=EPSILON):
|
||||||
|
@ -277,17 +282,17 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
|
||||||
f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
|
f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
|
||||||
self.learner, validation = training_helper(
|
self.learner, validation = training_helper(
|
||||||
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
|
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
|
||||||
Px = self.soft_classify(validation.instances)
|
Px = self.posterior_probabilities(validation.instances)
|
||||||
self.Pxy1 = Px[validation.labels == 1]
|
self.Pxy1 = Px[validation.labels == 1]
|
||||||
self.Pxy0 = Px[validation.labels == 0]
|
self.Pxy0 = Px[validation.labels == 0]
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, instances, *args):
|
def aggregate(self, classif_posteriors, *args):
|
||||||
# "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
|
# "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
|
||||||
# and the final estimated a priori probability was taken as the median of these 11 estimates."
|
# and the final estimated a priori probability was taken as the median of these 11 estimates."
|
||||||
# (González-Castro, et al., 2013).
|
# (González-Castro, et al., 2013).
|
||||||
|
|
||||||
Px = self.soft_classify(instances)
|
Px = classif_posteriors
|
||||||
|
|
||||||
prev_estimations = []
|
prev_estimations = []
|
||||||
for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
|
for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
|
||||||
|
@ -318,71 +323,87 @@ class OneVsAll(AggregativeQuantifier):
|
||||||
quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1.
|
quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, binary_method, n_jobs=-1):
|
def __init__(self, binary_quantifier, n_jobs=-1):
|
||||||
self.binary_method = binary_method
|
self.binary_quantifier = binary_quantifier
|
||||||
self.n_jobs = n_jobs
|
self.n_jobs = n_jobs
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, **kwargs):
|
def fit(self, data: LabelledCollection, **kwargs):
|
||||||
assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
|
assert not data.binary, \
|
||||||
assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
|
f'{self.__class__.__name__} expect non-binary data'
|
||||||
self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
|
assert isinstance(self.binary_quantifier, BaseQuantifier), \
|
||||||
Parallel(n_jobs=self.n_jobs, backend='threading')(
|
f'{self.binary_quantifier} does not seem to be a Quantifier'
|
||||||
delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
|
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
|
||||||
)
|
self.__parallel(self._delayed_binary_fit, data, **kwargs)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, X, *args):
|
def classify(self, instances):
|
||||||
prevalences = np.asarray(
|
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
|
||||||
Parallel(n_jobs=self.n_jobs, backend='threading')(
|
return classif_predictions_bin.T
|
||||||
delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
|
|
||||||
)
|
def aggregate(self, classif_predictions_bin, *args):
|
||||||
)
|
assert set(np.unique(classif_predictions_bin)) == {0,1}, \
|
||||||
<<<<<<< HEAD
|
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
|
||||||
=======
|
'predictions for each document (row) and class (columns)'
|
||||||
print('one vs all: ', prevalences)
|
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
|
||||||
>>>>>>> 2361186a01c53e744f4291e2e2299700216ff139
|
|
||||||
return F.normalize_prevalence(prevalences)
|
return F.normalize_prevalence(prevalences)
|
||||||
|
|
||||||
|
def quantify(self, X, *args):
|
||||||
|
prevalences = self.__parallel(self._delayed_binary_quantify, X)
|
||||||
|
return F.normalize_prevalence(prevalences)
|
||||||
|
|
||||||
|
def __parallel(self, func, *args, **kwargs):
|
||||||
|
return np.asarray(
|
||||||
|
Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||||
|
delayed(func)(c, *args, **kwargs) for c in self.classes
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def classes(self):
|
def classes(self):
|
||||||
return sorted(self.class_method.keys())
|
return sorted(self.dict_binary_quantifiers.keys())
|
||||||
|
|
||||||
def set_params(self, **parameters):
|
def set_params(self, **parameters):
|
||||||
self.binary_method.set_params(**parameters)
|
self.binary_quantifier.set_params(**parameters)
|
||||||
|
|
||||||
def get_params(self, deep=True):
|
def get_params(self, deep=True):
|
||||||
return self.binary_method.get_params()
|
return self.binary_quantifier.get_params()
|
||||||
|
|
||||||
def _delayed_binary_predict(self, c, learners, X):
|
def _delayed_binary_classification(self, c, X):
|
||||||
return learners[c].classify(X).mean() # the mean is the estimation for the positive class prevalence
|
return self.dict_binary_quantifiers[c].classify(X)
|
||||||
|
|
||||||
def _delayed_binary_fit(self, c, learners, data, **kwargs):
|
def _delayed_binary_quantify(self, c, X):
|
||||||
|
return self.dict_binary_quantifiers[c].quantify(X)[1] # the estimation for the positive class prevalence
|
||||||
|
|
||||||
|
def _delayed_binary_aggregate(self, c, classif_predictions):
|
||||||
|
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1] # the estimation for the positive class prevalence
|
||||||
|
|
||||||
|
def _delayed_binary_fit(self, c, data, **kwargs):
|
||||||
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
|
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
|
||||||
learners[c].fit(bindata, **kwargs)
|
self.dict_binary_quantifiers[c].fit(bindata, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class ExplicitLossMinimisation(AggregativeQuantifier):
|
# class ExplicitLossMinimisation(AggregativeQuantifier):
|
||||||
"""
|
# """
|
||||||
A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
|
# A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
|
||||||
quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
|
# quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
|
||||||
This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
# This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||||
Social Network Analysis and Mining6(19), 1–22 (2016)
|
# Social Network Analysis and Mining6(19), 1–22 (2016)
|
||||||
"""
|
# """
|
||||||
|
#
|
||||||
def __init__(self, svmperf_base, loss, **kwargs):
|
# def __init__(self, svmperf_base, loss, **kwargs):
|
||||||
self.svmperf_base = svmperf_base
|
# self.svmperf_base = svmperf_base
|
||||||
self.loss = loss
|
# self.loss = loss
|
||||||
self.kwargs = kwargs
|
# self.kwargs = kwargs
|
||||||
|
#
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True, *args):
|
# def fit(self, data: LabelledCollection, fit_learner=True, *args):
|
||||||
assert fit_learner, 'the method requires that fit_learner=True'
|
# assert fit_learner, 'the method requires that fit_learner=True'
|
||||||
self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
|
# self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
|
||||||
if not data.binary:
|
# if not data.binary:
|
||||||
self.learner = OneVsAll(self.learner, n_jobs=-1)
|
# self.learner = OneVsAll(self.learner, n_jobs=-1)
|
||||||
return self.learner.fit(data, *args)
|
# return self.learner.fit(data, *args)
|
||||||
|
#
|
||||||
def quantify(self, instances, *args):
|
# def aggregate(self, instances, *args):
|
||||||
return self.learner.quantify(instances, *args)
|
# return self.learner.aggregate(instances, *args)
|
||||||
|
|
||||||
|
|
||||||
class ExplicitLossMinimisationBinary(AggregativeQuantifier):
|
class ExplicitLossMinimisationBinary(AggregativeQuantifier):
|
||||||
|
@ -398,38 +419,35 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier):
|
||||||
self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
|
self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, X, y=None):
|
def aggregate(self, classif_predictions:np.ndarray, *args):
|
||||||
predictions = self.learner.predict(X)
|
return F.prevalence_from_labels(classif_predictions, self.learner.n_classes_)
|
||||||
prev = F.prevalence_from_labels(predictions, self.learner.n_classes_)
|
|
||||||
print('binary: ', prev)
|
|
||||||
return prev
|
|
||||||
|
|
||||||
def classify(self, X, y=None):
|
def classify(self, X, y=None):
|
||||||
return self.learner.predict(X)
|
return self.learner.predict(X)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class SVMQ(ExplicitLossMinimisation):
|
class SVMQ(ExplicitLossMinimisationBinary):
|
||||||
def __init__(self, svmperf_base, **kwargs):
|
def __init__(self, svmperf_base, **kwargs):
|
||||||
super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
|
super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class SVMKLD(ExplicitLossMinimisation):
|
class SVMKLD(ExplicitLossMinimisationBinary):
|
||||||
def __init__(self, svmperf_base, **kwargs):
|
def __init__(self, svmperf_base, **kwargs):
|
||||||
super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)
|
super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class SVMNKLD(ExplicitLossMinimisation):
|
class SVMNKLD(ExplicitLossMinimisationBinary):
|
||||||
def __init__(self, svmperf_base, **kwargs):
|
def __init__(self, svmperf_base, **kwargs):
|
||||||
super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)
|
super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class SVMAE(ExplicitLossMinimisation):
|
class SVMAE(ExplicitLossMinimisationBinary):
|
||||||
def __init__(self, svmperf_base, **kwargs):
|
def __init__(self, svmperf_base, **kwargs):
|
||||||
super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs)
|
super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class SVMRAE(ExplicitLossMinimisation):
|
class SVMRAE(ExplicitLossMinimisationBinary):
|
||||||
def __init__(self, svmperf_base, **kwargs):
|
def __init__(self, svmperf_base, **kwargs):
|
||||||
super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
|
super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
|
||||||
|
|
||||||
|
@ -438,7 +456,7 @@ CC = ClassifyAndCount
|
||||||
ACC = AdjustedClassifyAndCount
|
ACC = AdjustedClassifyAndCount
|
||||||
PCC = ProbabilisticClassifyAndCount
|
PCC = ProbabilisticClassifyAndCount
|
||||||
PACC = ProbabilisticAdjustedClassifyAndCount
|
PACC = ProbabilisticAdjustedClassifyAndCount
|
||||||
ELM = ExplicitLossMinimisation
|
ELM = ExplicitLossMinimisationBinary
|
||||||
EMQ = ExpectationMaximizationQuantifier
|
EMQ = ExpectationMaximizationQuantifier
|
||||||
HDy = HellingerDistanceY
|
HDy = HellingerDistanceY
|
||||||
|
|
||||||
|
|
|
@ -18,3 +18,48 @@ class BaseQuantifier(metaclass=ABCMeta):
|
||||||
def get_params(self, deep=True): ...
|
def get_params(self, deep=True): ...
|
||||||
|
|
||||||
|
|
||||||
|
# class OneVsAll(AggregativeQuantifier):
|
||||||
|
# """
|
||||||
|
# Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
|
||||||
|
# quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
|
||||||
|
# """
|
||||||
|
#
|
||||||
|
# def __init__(self, binary_method, n_jobs=-1):
|
||||||
|
# self.binary_method = binary_method
|
||||||
|
# self.n_jobs = n_jobs
|
||||||
|
#
|
||||||
|
# def fit(self, data: LabelledCollection, **kwargs):
|
||||||
|
# assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
|
||||||
|
# assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
|
||||||
|
# self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
|
||||||
|
# Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||||
|
# delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
|
||||||
|
# )
|
||||||
|
# return self
|
||||||
|
#
|
||||||
|
# def quantify(self, X, *args):
|
||||||
|
# prevalences = np.asarray(
|
||||||
|
# Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||||
|
# delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
|
||||||
|
# )
|
||||||
|
# )
|
||||||
|
# return F.normalize_prevalence(prevalences)
|
||||||
|
#
|
||||||
|
# @property
|
||||||
|
# def classes(self):
|
||||||
|
# return sorted(self.class_method.keys())
|
||||||
|
#
|
||||||
|
# def set_params(self, **parameters):
|
||||||
|
# self.binary_method.set_params(**parameters)
|
||||||
|
#
|
||||||
|
# def get_params(self, deep=True):
|
||||||
|
# return self.binary_method.get_params()
|
||||||
|
#
|
||||||
|
# def _delayed_binary_predict(self, c, learners, X):
|
||||||
|
# return learners[c].quantify(X)[:,1] # the mean is the estimation for the positive class prevalence
|
||||||
|
#
|
||||||
|
# def _delayed_binary_fit(self, c, learners, data, **kwargs):
|
||||||
|
# bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
|
||||||
|
# learners[c].fit(bindata, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
|
29
test.py
29
test.py
|
@ -6,6 +6,7 @@ import quapy.functional as F
|
||||||
|
|
||||||
SAMPLE_SIZE=500
|
SAMPLE_SIZE=500
|
||||||
binary = False
|
binary = False
|
||||||
|
svmperf_home = './svm_perf_quantification'
|
||||||
|
|
||||||
if binary:
|
if binary:
|
||||||
# load a textual binary dataset and create a tfidf bag of words
|
# load a textual binary dataset and create a tfidf bag of words
|
||||||
|
@ -20,19 +21,31 @@ else:
|
||||||
train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
|
train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
|
||||||
test_path = './datasets/twitter/test/sst.test.feature.txt'
|
test_path = './datasets/twitter/test/sst.test.feature.txt'
|
||||||
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
|
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
|
||||||
|
dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
|
||||||
|
qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
|
||||||
|
print(dataset.training.instances.shape)
|
||||||
|
|
||||||
|
print('dataset loaded')
|
||||||
|
|
||||||
# training a quantifier
|
# training a quantifier
|
||||||
learner = LogisticRegression()
|
learner = LogisticRegression()
|
||||||
model = qp.method.aggregative.ClassifyAndCount(learner)
|
# model = qp.method.aggregative.ClassifyAndCount(learner)
|
||||||
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
|
|
||||||
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
|
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
|
||||||
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
|
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
|
||||||
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
|
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
|
||||||
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
|
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
|
||||||
|
# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
|
||||||
|
model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
|
||||||
|
|
||||||
|
if not binary:
|
||||||
|
model = qp.method.aggregative.OneVsAll(model)
|
||||||
|
|
||||||
|
print('fitting model')
|
||||||
model.fit(dataset.training)
|
model.fit(dataset.training)
|
||||||
|
|
||||||
|
|
||||||
# estimating class prevalences
|
# estimating class prevalences
|
||||||
|
print('quantifying')
|
||||||
prevalences_estim = model.quantify(dataset.test.instances)
|
prevalences_estim = model.quantify(dataset.test.instances)
|
||||||
prevalences_true = dataset.test.prevalence()
|
prevalences_true = dataset.test.prevalence()
|
||||||
|
|
||||||
|
@ -46,9 +59,17 @@ print(f'true prevalence {F.strprev(prevalences_true)}')
|
||||||
print(f'estim prevalence {F.strprev(prevalences_estim)}')
|
print(f'estim prevalence {F.strprev(prevalences_estim)}')
|
||||||
print(f'mae={error:.3f}')
|
print(f'mae={error:.3f}')
|
||||||
|
|
||||||
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE)
|
|
||||||
|
|
||||||
qp.error.SAMPLE_SIZE=SAMPLE_SIZE
|
max_evaluations = 5000
|
||||||
|
n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
|
||||||
|
n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
|
||||||
|
print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that '
|
||||||
|
f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded. '
|
||||||
|
f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')
|
||||||
|
|
||||||
|
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
|
||||||
|
|
||||||
|
qp.error.SAMPLE_SIZE = SAMPLE_SIZE
|
||||||
print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
|
print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
|
||||||
for error in qp.error.QUANTIFICATION_ERROR:
|
for error in qp.error.QUANTIFICATION_ERROR:
|
||||||
score = error(true_prev, estim_prev)
|
score = error(true_prev, estim_prev)
|
||||||
|
|
Loading…
Reference in New Issue