Aggregative methods adapted. Explicit loss minimization methods (SVMQ, SVMKLD, ...) added, with support for binary or single-label problems. HDy added.

Alejandro Moreo Fernandez 2020-12-04 19:32:08 +01:00
parent a882424eeb
commit 9c8d29156c
11 changed files with 244 additions and 64 deletions


@@ -1,3 +1,4 @@
 Documentation with sphinx
+Document methods with paper references
 The parallel training in svmperf seems not to work
 Add "prepare svmperf for quantification" script


@@ -20,12 +20,9 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         self.verbose = verbose
         self.loss = loss
 
-    def set_c(self, C):
-        self.param_C = '-c ' + str(C)
-
     def set_params(self, **parameters):
         assert list(parameters.keys()) == ['C'], 'currently, only the C parameter is supported'
-        self.set_c(parameters['C'])
+        self.C = parameters['C']
 
     def fit(self, X, y):
         assert self.loss in SVMperf.valid_losses, \
@@ -33,8 +30,8 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         self.svmperf_learn = join(self.svmperf_base, 'svm_perf_learn')
         self.svmperf_classify = join(self.svmperf_base, 'svm_perf_classify')
-        self.loss_cmd = '-l ' + str(self.valid_losses[self.loss])
-        self.set_c(self.C)
+        self.loss_cmd = '-w 3 -l ' + str(self.valid_losses[self.loss])
+        self.c_cmd = '-c ' + str(self.C)
 
         self.classes_ = sorted(np.unique(y))
         self.n_classes_ = len(self.classes_)
@@ -49,7 +46,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         dump_svmlight_file(X, y, traindat, zero_based=False)
-        cmd = ' '.join([self.svmperf_learn, self.param_C, self.loss_cmd, traindat, self.model])
+        cmd = ' '.join([self.svmperf_learn, self.c_cmd, self.loss_cmd, traindat, self.model])
         if self.verbose:
             print('[Running]', cmd)
         p = subprocess.run(cmd.split(), stdout=PIPE, stderr=STDOUT)
@@ -60,7 +57,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         return self
 
-    def predict(self, X, y=None):
+    def predict(self, X):
         confidence_scores = self.decision_function(X)
         predictions = (confidence_scores > 0) * 1
         return predictions
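Note: as a reference for the reworked command construction, a sketch of the learn call that fit now assembles (the numeric loss code is looked up in SVMperf.valid_losses, which this diff does not show, and the temporary train/model paths are abbreviated):

svm = SVMperf(svmperf_base='./svm_perf_quantification', C=1000, loss='q', verbose=True)
svm.fit(X, y)
# prints, e.g.: [Running] ./svm_perf_quantification/svm_perf_learn -c 1000 -w 3 -l <loss-code> <traindat> <model>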


@@ -43,13 +43,13 @@ class LabelledCollection:
     @property
     def binary(self):
-        return self.n_classes==2
+        return self.n_classes == 2
 
     def sampling_index(self, size, *prevs, shuffle=True):
         if len(prevs) == self.n_classes-1:
             prevs = prevs + (1-sum(prevs),)
         assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
-        assert sum(prevs) == 1, f'prevalences ({prevs}) out of range (sum={sum(prevs)})'
+        assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
         taken = 0
         indexes_sample = []
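Note: sampling_index lets callers omit the last prevalence, which is completed so that the prevalences sum to 1. A sketch, where data stands for a ternary LabelledCollection:

data.sampling_index(500, 0.2, 0.5, 0.3)  # all three prevalences given explicitly
data.sampling_index(500, 0.2, 0.5)       # equivalent: the last one is completed as 1 - (0.2 + 0.5) = 0.3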


@@ -1,5 +1,6 @@
 from sklearn.metrics import f1_score
-from settings import SAMPLE_SIZE
+
+SAMPLE_SIZE = None
 
 def f1e(y_true, y_pred):
@@ -20,11 +21,21 @@ def ae(p, p_hat):
     return abs(p_hat-p).mean(axis=-1)
 
+def __check_eps(eps):
+    if eps is None:
+        if SAMPLE_SIZE is None:
+            raise ValueError('eps was not defined, and qp.error.SAMPLE_SIZE was not set')
+        else:
+            eps = 1. / (2. * SAMPLE_SIZE)
+    return eps
+
-def mrae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)):
+def mrae(p, p_hat, eps=None):
     return rae(p, p_hat, eps).mean()
 
-def rae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)):
+def rae(p, p_hat, eps=None):
+    eps = __check_eps(eps)
     p = smooth(p, eps)
     p_hat = smooth(p_hat, eps)
     return (abs(p-p_hat)/p).mean(axis=-1)
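Note: with the eps default now resolved at call time rather than at import time, the intended usage is to set the module-level SAMPLE_SIZE once (a sketch, assuming the module is exposed as qp.error, as the new error message suggests):

import quapy as qp
qp.error.SAMPLE_SIZE = 500                # the sample size of the evaluation protocol
qp.error.mrae(p_true, p_estim)            # eps defaults to 1. / (2. * 500) = 0.001
qp.error.mrae(p_true, p_estim, eps=0.01)  # an explicit eps bypasses SAMPLE_SIZE
# with eps=None and SAMPLE_SIZE unset, __check_eps raises the ValueError above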


@@ -15,6 +15,26 @@ def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, retur
     return prevs
 
+def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
+    """
+    Produces an array of uniformly separated prevalence values. By default, produces an array of 21 prevalences,
+    with step 0.05 and with the limits smoothed, i.e.:
+    [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99]
+    :param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
+    :param repeat: number of times each prevalence is to be repeated (defaults to 1)
+    :param smooth_limits_epsilon: the quantity to add to and subtract from the limits 0 and 1, respectively
+    :return: an array of uniformly separated prevalence values
+    """
+    p = np.linspace(0., 1., num=n_prevalences, endpoint=True)
+    p[0] += smooth_limits_epsilon
+    p[-1] -= smooth_limits_epsilon
+    if p[0] > p[1]:
+        raise ValueError(f'the smoothing in the limits is greater than the prevalence step')
+    if repeat > 1:
+        p = np.repeat(p, repeat)
+    return p
+
 def prevalence_from_labels(labels, n_classes):
     unique, counts = np.unique(labels, return_counts=True)
     by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
@@ -47,3 +67,13 @@ def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True):
     return adjusted
 
+def normalize_prevalence(prevalences):
+    assert prevalences.ndim==1, 'unexpected shape'
+    accum = prevalences.sum()
+    if accum > 0:
+        return prevalences / accum
+    else:
+        # if all classifiers are trivial rejectors
+        return np.ones_like(prevalences) / prevalences.size
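Note: a quick illustration of the two helpers just added (outputs follow directly from the code above):

import numpy as np
import quapy.functional as F

F.prevalence_linspace(n_prevalences=21, smooth_limits_epsilon=0.01)
# -> [0.01, 0.05, 0.10, ..., 0.90, 0.95, 0.99]
F.normalize_prevalence(np.asarray([0.2, 0.2, 0.4]))  # -> [0.25, 0.25, 0.50]
F.normalize_prevalence(np.zeros(3))                  # all trivial rejectors -> [1/3, 1/3, 1/3]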


@@ -9,6 +9,7 @@ AGGREGATIVE_METHODS = {
     agg.ProbabilisticAdjustedClassifyAndCount,
     agg.ExplicitLossMinimisation,
     agg.ExpectationMaximizationQuantifier,
+    agg.HellingerDistanceY
 }
 
 NON_AGGREGATIVE_METHODS = {
@@ -19,12 +20,6 @@ QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS
 # common alisases
 
-CC = agg.ClassifyAndCount
-ACC = agg.AdjustedClassifyAndCount
-PCC = agg.ProbabilisticClassifyAndCount
-PACC = agg.ProbabilisticAdjustedClassifyAndCount
-ELM = agg.ExplicitLossMinimisation
-EMQ = agg.ExpectationMaximizationQuantifier
 MLPE = nagg.MaximumLikelihoodPrevalenceEstimation


@@ -9,6 +9,8 @@ from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
 
 # Abstract classes
 # ------------------------------------
@@ -21,8 +23,8 @@ class AggregativeQuantifier(BaseQuantifier):
     @abstractmethod
     def fit(self, data: LabelledCollection, fit_learner=True, *args): ...
 
-    def classify(self, documents):
-        return self.learner.predict(documents)
+    def classify(self, instances):
+        return self.learner.predict(instances)
 
     def get_params(self, deep=True):
         return self.learner.get_params()
@@ -70,7 +72,7 @@ def training_helper(learner,
     :param fit_learner: whether or not to fit the learner
     :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
     learner is not probabilistic, then a CalibratedCV instance of it is trained)
-    :param train_val_split: if specified, indicates the proportion of training documents on which to fit the learner
+    :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
     :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
     or None otherwise)
     """
@@ -118,8 +120,8 @@ class ClassifyAndCount(AggregativeQuantifier):
         self.learner, _ = training_helper(self.learner, data, fit_learner)
         return self
 
-    def quantify(self, documents, *args):
-        classification = self.classify(documents)  # classify
+    def quantify(self, instances, *args):
+        classification = self.classify(instances)  # classify
         return F.prevalence_from_labels(classification, self.n_classes)  # & count
@@ -138,8 +140,8 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
         self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
         return self
 
-    def quantify(self, documents, *args):
-        prevs_estim = self.cc.quantify(documents)
+    def quantify(self, instances, *args):
+        prevs_estim = self.cc.quantify(instances)
         # solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim
         A = self.Pte_cond_estim_
         B = prevs_estim
@@ -163,8 +165,8 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
         self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
         return self
 
-    def quantify(self, documents, *args):
-        posteriors = self.soft_classify(documents)  # classify
+    def quantify(self, instances, *args):
+        posteriors = self.soft_classify(instances)  # classify
         prevalences = F.prevalence_from_probabilities(posteriors, binarize=False)  # & count
         return prevalences
@@ -186,8 +188,8 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
         self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
         return self
 
-    def quantify(self, documents, *args):
-        prevs_estim = self.pcc.quantify(documents)
+    def quantify(self, instances, *args):
+        prevs_estim = self.pcc.quantify(instances)
         A = self.Pte_cond_estim_
         B = prevs_estim
         try:
@@ -252,53 +254,82 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
         return qs
 
-# todo: from here
-def train_task(c, learners, data):
-    learners[c].fit(data.documents, data.labels == c)
-
+class HellingerDistanceY(AggregativeProbabilisticQuantifier):
+    """
+    Implementation of the method based on the Hellinger Distance y (HDy) proposed by
+    González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution
+    estimation based on the Hellinger distance. Information Sciences, 218:146-164.
+    """
+
+    def __init__(self, learner):
+        self.learner = learner
+
+    def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
+        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
+        self.learner, validation = training_helper(
+            self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
+        Px = self.soft_classify(validation.instances)
+        self.Pxy1 = Px[validation.labels == 1]
+        self.Pxy0 = Px[validation.labels == 0]
+        return self
+
+    def quantify(self, instances, *args):
+        # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
+        # and the final estimated a priori probability was taken as the median of these 11 estimates."
+        # (González-Castro et al., 2013)
+        Px = self.soft_classify(instances)
+
+        prev_estimations = []
+        for bins in np.linspace(10, 110, 11, dtype=int):  # [10, 20, 30, ..., 100, 110]
+            Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
+            Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
+            Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True)
+
+            prev_selected, min_dist = None, None
+            for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0):
+                Px_train = prev*Pxy1_density + (1 - prev)*Pxy0_density
+                hdy = HellingerDistanceY.HellingerDistance(Px_train, Px_test)
+                if prev_selected is None or hdy < min_dist:
+                    prev_selected, min_dist = prev, hdy
+            prev_estimations.append(prev_selected)
+
+        pos_class_prev = np.median(prev_estimations)
+        return np.asarray([1-pos_class_prev, pos_class_prev])
+
+    @classmethod
+    def HellingerDistance(cls, P, Q):
+        return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
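Note: a compact numeric sketch of the search that HellingerDistanceY.quantify performs at a single bin resolution (synthetic posteriors, NumPy only; the class repeats this for bins in {10, 20, ..., 110} and takes the median of the 11 estimates):

import numpy as np

Pxy0 = np.random.beta(2, 5, size=200)  # synthetic validation posteriors, negative class
Pxy1 = np.random.beta(5, 2, size=200)  # synthetic validation posteriors, positive class
Px = np.random.beta(4, 3, size=500)    # synthetic test posteriors
d0, _ = np.histogram(Pxy0, bins=10, range=(0, 1), density=True)
d1, _ = np.histogram(Pxy1, bins=10, range=(0, 1), density=True)
dt, _ = np.histogram(Px, bins=10, range=(0, 1), density=True)
hd = lambda P, Q: np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q)) ** 2))
prevs = np.linspace(0., 1., 100)  # stand-in for F.prevalence_linspace(n_prevalences=100, smooth_limits_epsilon=0.0)
best = prevs[np.argmin([hd(p * d1 + (1 - p) * d0, dt) for p in prevs])]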
-def binary_quant_task(c, learners, X):
-    predictions_ci = learners[c].predict(X)
-    return predictions_ci.mean()  # since the predictions array is binary
-
-class OneVsAllELM(AggregativeQuantifier):
-    def __init__(self, svmperf_base, loss, n_jobs=-1, **kwargs):
-        self.svmperf_base = svmperf_base
-        self.loss = loss
+class OneVsAll(AggregativeQuantifier):
+
+    def __init__(self, binary_method, n_jobs=-1, **kwargs):
+        self.binary_method = binary_method
         self.n_jobs = n_jobs
         self.kwargs = kwargs
 
-    def fit(self, data: LabelledCollection, fit_learner=True, *args):
-        assert fit_learner, 'the method requires that fit_learner=True'
-        self.learners = {c: SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs) for c in data.classes_}
+    def fit(self, data: LabelledCollection, **kwargs):
+        assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
+        self.class_method = {c: self.binary_method(**self.kwargs) for c in data.classes_}
         Parallel(n_jobs=self.n_jobs, backend='threading')(
-            delayed(train_task)(c, self.learners, data) for c in self.learners.keys()
+            delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
         )
         return self
 
-    def quantify(self, X, y=None):
+    def quantify(self, X, *args):
         prevalences = np.asarray(
             Parallel(n_jobs=self.n_jobs, backend='threading')(
-                delayed(binary_quant_task)(c, self.learners, X) for c in self.learners.keys()
+                delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
             )
         )
-        prevalences /= prevalences.sum()
-        return prevalences
+        print('one vs all: ', prevalences)
+        return F.normalize_prevalence(prevalences)
 
     @property
     def classes(self):
-        return sorted(self.learners.keys())
+        return sorted(self.class_method.keys())
 
-    def preclassify_collection(self, data: LabelledCollection):
-        classifications = []
-        for class_ in data.classes_:
-            classifications.append(self.learners[class_].predict(data.instances))
-        classifications = np.vstack(classifications).T
-        precomputed = LabelledCollection(classifications, data.labels)
-        return precomputed
-
     def set_params(self, **parameters):
         self.kwargs=parameters
@@ -306,20 +337,57 @@ class OneVsAllELM(AggregativeQuantifier):
     def get_params(self, deep=True):
         return self.kwargs
 
+    def _delayed_binary_predict(self, c, learners, X):
+        return learners[c].classify(X).mean()  # the mean is the estimation for the positive class prevalence
+
+    def _delayed_binary_fit(self, c, learners, data, **kwargs):
+        bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
+        learners[c].fit(bindata, **kwargs)
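Note: each binary quantifier contributes a single number (the mean of its binary predictions, i.e. its positive-class prevalence estimate), and these estimates need not sum to 1 across classes, hence the final F.normalize_prevalence. With hypothetical per-class estimates:

import numpy as np
per_class = np.asarray([0.5, 0.4, 0.3])  # one positive-prevalence estimate per binary learner
per_class / per_class.sum()              # -> [0.417, 0.333, 0.250], what normalize_prevalence returns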
 class ExplicitLossMinimisation(AggregativeQuantifier):
 
     def __init__(self, svmperf_base, loss, **kwargs):
-        self.learner = SVMperf(svmperf_base, loss=loss, **kwargs)
+        self.svmperf_base = svmperf_base
+        self.loss = loss
+        self.kwargs = kwargs
 
     def fit(self, data: LabelledCollection, fit_learner=True, *args):
         assert fit_learner, 'the method requires that fit_learner=True'
-        self.learner.fit(data.instances, data.labels)
-        return self
+        if data.binary:
+            self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
+        else:
+            self.learner = OneVsAll(
+                binary_method=ExplicitLossMinimisationBinary,
+                n_jobs=-1,
+                svmperf_base=self.svmperf_base,
+                loss=self.loss,
+                **self.kwargs
+            )
+        return self.learner.fit(data, *args)
+
+    def quantify(self, instances, *args):
+        return self.learner.quantify(instances, *args)
+
+
+class ExplicitLossMinimisationBinary(AggregativeQuantifier):
+
+    def __init__(self, svmperf_base, loss, **kwargs):
+        self.svmperf_base = svmperf_base
+        self.loss = loss
+        self.kwargs = kwargs
+
+    def fit(self, data: LabelledCollection, fit_learner=True, *args):
+        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
+        assert fit_learner, 'the method requires that fit_learner=True'
+        self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
+        return self
 
     def quantify(self, X, y=None):
         predictions = self.learner.predict(X)
-        return F.prevalence_from_labels(predictions, self.learner.n_classes_)
+        prev = F.prevalence_from_labels(predictions, self.learner.n_classes_)
+        print('binary: ', prev)
+        return prev
 
     def classify(self, X, y=None):
         return self.learner.predict(X)
@@ -349,3 +417,12 @@ class SVMRAE(ExplicitLossMinimisation):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
 
+CC = ClassifyAndCount
+ACC = AdjustedClassifyAndCount
+PCC = ProbabilisticClassifyAndCount
+PACC = ProbabilisticAdjustedClassifyAndCount
+ELM = ExplicitLossMinimisation
+EMQ = ExpectationMaximizationQuantifier
+HDy = HellingerDistanceY
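Note: with these aliases, the binary vs one-vs-all dispatch in ExplicitLossMinimisation is transparent to callers; a usage sketch, where binary_data and ternary_data stand for LabelledCollection instances:

q = ELM(svmperf_base='./svm_perf_quantification', loss='q', C=1000)
q.fit(binary_data)   # data.binary -> delegates to ExplicitLossMinimisationBinary
q = ELM(svmperf_base='./svm_perf_quantification', loss='q', C=1000)
q.fit(ternary_data)  # otherwise -> OneVsAll(binary_method=ExplicitLossMinimisationBinary, ...)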


@@ -10,7 +10,7 @@ class BaseQuantifier(metaclass=ABCMeta):
     def fit(self, data: qp.LabelledCollection, *args): ...
 
     @abstractmethod
-    def quantify(self, documents, *args): ...
+    def quantify(self, instances, *args): ...
 
     @abstractmethod
     def set_params(self, **parameters): ...

quapy/utils/__init__.py (new file, +1)

@@ -0,0 +1 @@
+from . import util

quapy/utils/util.py (new file, +22)

@@ -0,0 +1,22 @@
+import itertools
+import multiprocessing
+
+from joblib import Parallel, delayed
+
+
+def get_parallel_slices(n_tasks, n_jobs=-1):
+    if n_jobs == -1:
+        n_jobs = multiprocessing.cpu_count()
+    batch = int(n_tasks / n_jobs)
+    remainder = n_tasks % n_jobs
+    return [slice(job * batch, (job + 1) * batch + (remainder if job == n_jobs - 1 else 0)) for job in
+            range(n_jobs)]
+
+
+def parallelize(func, args, n_jobs):
+    slices = get_parallel_slices(len(args), n_jobs)
+    results = Parallel(n_jobs=n_jobs)(
+        delayed(func)(args[slice_i]) for slice_i in slices
+    )
+    return list(itertools.chain.from_iterable(results))
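Note: a quick sanity check of the slicing logic (the remainder is absorbed by the last job):

get_parallel_slices(10, n_jobs=3)
# -> [slice(0, 3), slice(3, 6), slice(6, 10)]  # batch=3; the remaining task goes to the last slice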

test.py (new file, +46)

@@ -0,0 +1,46 @@
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import LinearSVC
+
+import quapy as qp
+import quapy.functional as F
+
+
+# load a textual binary dataset and create a tfidf bag of words
+train_path = './datasets/reviews/kindle/train.txt'
+test_path = './datasets/reviews/kindle/test.txt'
+dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
+dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
+dataset.test = dataset.test.sampling(500, 0.6, 0.4)
+qp.preprocessing.text2tfidf(dataset, inplace=True)
+qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
+
+# load a sparse matrix ternary dataset
+#train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
+#test_path = './datasets/twitter/test/sst.test.feature.txt'
+#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
+#dataset.training = dataset.training.sampling(500, 0.3, 0.2, 0.5)
+#dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)
+
+# training a quantifier
+learner = LogisticRegression()
+# q = qp.method.aggregative.ClassifyAndCount(learner)
+# q = qp.method.aggregative.AdjustedClassifyAndCount(learner)
+# q = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
+# q = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
+# q = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
+# q = qp.method.aggregative.ExplicitLossMinimisation(svmperf_base='./svm_perf_quantification', loss='q', verbose=0, C=1000)
+# q = qp.method.aggregative.SVMQ(svmperf_base='./svm_perf_quantification', verbose=0, C=1000)
+q = qp.method.aggregative.HDy(learner)
+q.fit(dataset.training)
+
+# estimating class prevalences
+prevalences_estim = q.quantify(dataset.test.instances)
+prevalences_true = dataset.test.prevalence()
+
+# evaluation (one single prediction)
+error = qp.error.mae(prevalences_true, prevalences_estim)
+
+print(f'method {q.__class__.__name__}')
+print(f'true prevalence {F.strprev(prevalences_true)}')
+print(f'estim prevalence {F.strprev(prevalences_estim)}')
+print(f'MAE={error:.3f}')