Aggregative methods adapted. Explicit loss minimization methods (SVMQ, SVMKLD, ...) added, with support for binary or single-label problems. HDy added.

Alejandro Moreo Fernandez 2020-12-04 19:32:08 +01:00
parent a882424eeb
commit 9c8d29156c
11 changed files with 244 additions and 64 deletions


@@ -1,3 +1,4 @@
 Documentation with sphinx
+Document methods with paper references
 The parallel training in svmperf seems not to work
 Add "prepare svmperf for quantification" script


@@ -20,12 +20,9 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         self.verbose = verbose
         self.loss = loss
 
-    def set_c(self, C):
-        self.param_C = '-c ' + str(C)
-
     def set_params(self, **parameters):
         assert list(parameters.keys()) == ['C'], 'currently, only the C parameter is supported'
-        self.set_c(parameters['C'])
+        self.C = parameters['C']
 
     def fit(self, X, y):
         assert self.loss in SVMperf.valid_losses, \
@@ -33,8 +30,8 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         self.svmperf_learn = join(self.svmperf_base, 'svm_perf_learn')
         self.svmperf_classify = join(self.svmperf_base, 'svm_perf_classify')
-        self.loss_cmd = '-l ' + str(self.valid_losses[self.loss])
-        self.set_c(self.C)
+        self.loss_cmd = '-w 3 -l ' + str(self.valid_losses[self.loss])
+        self.c_cmd = '-c ' + str(self.C)
 
         self.classes_ = sorted(np.unique(y))
         self.n_classes_ = len(self.classes_)
@@ -49,7 +46,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         dump_svmlight_file(X, y, traindat, zero_based=False)
-        cmd = ' '.join([self.svmperf_learn, self.param_C, self.loss_cmd, traindat, self.model])
+        cmd = ' '.join([self.svmperf_learn, self.c_cmd, self.loss_cmd, traindat, self.model])
         if self.verbose:
             print('[Running]', cmd)
         p = subprocess.run(cmd.split(), stdout=PIPE, stderr=STDOUT)
@@ -60,7 +57,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         return self
 
-    def predict(self, X, y=None):
+    def predict(self, X):
         confidence_scores = self.decision_function(X)
         predictions = (confidence_scores > 0) * 1
         return predictions
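Note: as a reference for the reworked command construction, a sketch of the learn call that fit now assembles (the numeric loss code is looked up in SVMperf.valid_losses, which this diff does not show, and the temporary train/model paths are abbreviated):

svm = SVMperf(svmperf_base='./svm_perf_quantification', C=1000, loss='q', verbose=True)
svm.fit(X, y)
# prints, e.g.: [Running] ./svm_perf_quantification/svm_perf_learn -c 1000 -w 3 -l <loss-code> <traindat> <model>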


@@ -43,13 +43,13 @@ class LabelledCollection:
     @property
     def binary(self):
-        return self.n_classes==2
+        return self.n_classes == 2
 
     def sampling_index(self, size, *prevs, shuffle=True):
         if len(prevs) == self.n_classes-1:
             prevs = prevs + (1-sum(prevs),)
         assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
-        assert sum(prevs) == 1, f'prevalences ({prevs}) out of range (sum={sum(prevs)})'
+        assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
         taken = 0
         indexes_sample = []
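Note: sampling_index lets callers omit the last prevalence, which is completed so that the prevalences sum to 1. A sketch, where data stands for a ternary LabelledCollection:

data.sampling_index(500, 0.2, 0.5, 0.3)  # all three prevalences given explicitly
data.sampling_index(500, 0.2, 0.5)       # equivalent: the last one is completed as 1 - (0.2 + 0.5) = 0.3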


@@ -1,5 +1,6 @@
 from sklearn.metrics import f1_score
-from settings import SAMPLE_SIZE
+
+SAMPLE_SIZE = None
 
 def f1e(y_true, y_pred):
@@ -20,11 +21,21 @@ def ae(p, p_hat):
     return abs(p_hat-p).mean(axis=-1)
 
+def __check_eps(eps):
+    if eps is None:
+        if SAMPLE_SIZE is None:
+            raise ValueError('eps was not defined, and qp.error.SAMPLE_SIZE was not set')
+        else:
+            eps = 1. / (2. * SAMPLE_SIZE)
+    return eps
+
-def mrae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)):
+def mrae(p, p_hat, eps=None):
     return rae(p, p_hat, eps).mean()
 
-def rae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)):
+def rae(p, p_hat, eps=None):
+    eps = __check_eps(eps)
     p = smooth(p, eps)
     p_hat = smooth(p_hat, eps)
     return (abs(p-p_hat)/p).mean(axis=-1)
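Note: with the eps default now resolved at call time rather than at import time, the intended usage is to set the module-level SAMPLE_SIZE once (a sketch, assuming the module is exposed as qp.error, as the new error message suggests):

import quapy as qp
qp.error.SAMPLE_SIZE = 500                # the sample size of the evaluation protocol
qp.error.mrae(p_true, p_estim)            # eps defaults to 1. / (2. * 500) = 0.001
qp.error.mrae(p_true, p_estim, eps=0.01)  # an explicit eps bypasses SAMPLE_SIZE
# with eps=None and SAMPLE_SIZE unset, __check_eps raises the ValueError above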


@@ -15,6 +15,26 @@ def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, retur
     return prevs
 
+def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
+    """
+    Produces an array of uniformly separated prevalence values. By default, produces an array of 21 prevalences,
+    with step 0.05 and with the limits smoothed, i.e.:
+    [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99]
+    :param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
+    :param repeat: number of times each prevalence is to be repeated (defaults to 1)
+    :param smooth_limits_epsilon: the quantity to add to and subtract from the limits 0 and 1, respectively
+    :return: an array of uniformly separated prevalence values
+    """
+    p = np.linspace(0., 1., num=n_prevalences, endpoint=True)
+    p[0] += smooth_limits_epsilon
+    p[-1] -= smooth_limits_epsilon
+    if p[0] > p[1]:
+        raise ValueError(f'the smoothing in the limits is greater than the prevalence step')
+    if repeat > 1:
+        p = np.repeat(p, repeat)
+    return p
+
 def prevalence_from_labels(labels, n_classes):
     unique, counts = np.unique(labels, return_counts=True)
     by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
@@ -47,3 +67,13 @@ def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True):
     return adjusted
 
+def normalize_prevalence(prevalences):
+    assert prevalences.ndim==1, 'unexpected shape'
+    accum = prevalences.sum()
+    if accum > 0:
+        return prevalences / accum
+    else:
+        # if all classifiers are trivial rejectors
+        return np.ones_like(prevalences) / prevalences.size
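Note: a quick illustration of the two helpers just added (outputs follow directly from the code above):

import numpy as np
import quapy.functional as F

F.prevalence_linspace(n_prevalences=21, smooth_limits_epsilon=0.01)
# -> [0.01, 0.05, 0.10, ..., 0.90, 0.95, 0.99]
F.normalize_prevalence(np.asarray([0.2, 0.2, 0.4]))  # -> [0.25, 0.25, 0.50]
F.normalize_prevalence(np.zeros(3))                  # all trivial rejectors -> [1/3, 1/3, 1/3]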


@@ -9,6 +9,7 @@ AGGREGATIVE_METHODS = {
     agg.ProbabilisticAdjustedClassifyAndCount,
     agg.ExplicitLossMinimisation,
     agg.ExpectationMaximizationQuantifier,
+    agg.HellingerDistanceY
 }
 
 NON_AGGREGATIVE_METHODS = {
@@ -19,12 +20,6 @@ QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS
 # common alisases
 
-CC = agg.ClassifyAndCount
-ACC = agg.AdjustedClassifyAndCount
-PCC = agg.ProbabilisticClassifyAndCount
-PACC = agg.ProbabilisticAdjustedClassifyAndCount
-ELM = agg.ExplicitLossMinimisation
-EMQ = agg.ExpectationMaximizationQuantifier
 MLPE = nagg.MaximumLikelihoodPrevalenceEstimation


@@ -9,6 +9,8 @@ from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
 
 # Abstract classes
 # ------------------------------------
@@ -21,8 +23,8 @@ class AggregativeQuantifier(BaseQuantifier):
     @abstractmethod
     def fit(self, data: LabelledCollection, fit_learner=True, *args): ...
 
-    def classify(self, documents):
-        return self.learner.predict(documents)
+    def classify(self, instances):
+        return self.learner.predict(instances)
 
     def get_params(self, deep=True):
         return self.learner.get_params()
@@ -70,7 +72,7 @@ def training_helper(learner,
     :param fit_learner: whether or not to fit the learner
     :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
     learner is not probabilistic, then a CalibratedCV instance of it is trained)
-    :param train_val_split: if specified, indicates the proportion of training documents on which to fit the learner
+    :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
     :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
     or None otherwise)
     """
@@ -118,8 +120,8 @@ class ClassifyAndCount(AggregativeQuantifier):
         self.learner, _ = training_helper(self.learner, data, fit_learner)
         return self
 
-    def quantify(self, documents, *args):
-        classification = self.classify(documents)  # classify
+    def quantify(self, instances, *args):
+        classification = self.classify(instances)  # classify
         return F.prevalence_from_labels(classification, self.n_classes)  # & count
@@ -138,8 +140,8 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
         self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
         return self
 
-    def quantify(self, documents, *args):
-        prevs_estim = self.cc.quantify(documents)
+    def quantify(self, instances, *args):
+        prevs_estim = self.cc.quantify(instances)
         # solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim
         A = self.Pte_cond_estim_
         B = prevs_estim
@@ -163,8 +165,8 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
         self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
         return self
 
-    def quantify(self, documents, *args):
-        posteriors = self.soft_classify(documents)  # classify
+    def quantify(self, instances, *args):
+        posteriors = self.soft_classify(instances)  # classify
         prevalences = F.prevalence_from_probabilities(posteriors, binarize=False)  # & count
         return prevalences
@@ -186,8 +188,8 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
         self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
         return self
 
-    def quantify(self, documents, *args):
-        prevs_estim = self.pcc.quantify(documents)
+    def quantify(self, instances, *args):
+        prevs_estim = self.pcc.quantify(instances)
         A = self.Pte_cond_estim_
         B = prevs_estim
         try:
@@ -252,53 +254,82 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
         return qs
 
-# todo: from here
-def train_task(c, learners, data):
-    learners[c].fit(data.documents, data.labels == c)
-
+class HellingerDistanceY(AggregativeProbabilisticQuantifier):
+    """
+    Implementation of the method based on the Hellinger Distance y (HDy) proposed by
+    González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution
+    estimation based on the Hellinger distance. Information Sciences, 218:146-164.
+    """
+
+    def __init__(self, learner):
+        self.learner = learner
+
+    def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
+        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
+        self.learner, validation = training_helper(
+            self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
+        Px = self.soft_classify(validation.instances)
+        self.Pxy1 = Px[validation.labels == 1]
+        self.Pxy0 = Px[validation.labels == 0]
+        return self
+
+    def quantify(self, instances, *args):
+        # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
+        # and the final estimated a priori probability was taken as the median of these 11 estimates."
+        # (González-Castro et al., 2013)
+        Px = self.soft_classify(instances)
+
+        prev_estimations = []
+        for bins in np.linspace(10, 110, 11, dtype=int):  # [10, 20, 30, ..., 100, 110]
+            Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
+            Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
+            Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True)
+
+            prev_selected, min_dist = None, None
+            for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0):
+                Px_train = prev*Pxy1_density + (1 - prev)*Pxy0_density
+                hdy = HellingerDistanceY.HellingerDistance(Px_train, Px_test)
+                if prev_selected is None or hdy < min_dist:
+                    prev_selected, min_dist = prev, hdy
+            prev_estimations.append(prev_selected)
+
+        pos_class_prev = np.median(prev_estimations)
+        return np.asarray([1-pos_class_prev, pos_class_prev])
+
+    @classmethod
+    def HellingerDistance(cls, P, Q):
+        return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
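Note: a compact numeric sketch of the search that HellingerDistanceY.quantify performs at a single bin resolution (synthetic posteriors, NumPy only; the class repeats this for bins in {10, 20, ..., 110} and takes the median of the 11 estimates):

import numpy as np

Pxy0 = np.random.beta(2, 5, size=200)  # synthetic validation posteriors, negative class
Pxy1 = np.random.beta(5, 2, size=200)  # synthetic validation posteriors, positive class
Px = np.random.beta(4, 3, size=500)    # synthetic test posteriors
d0, _ = np.histogram(Pxy0, bins=10, range=(0, 1), density=True)
d1, _ = np.histogram(Pxy1, bins=10, range=(0, 1), density=True)
dt, _ = np.histogram(Px, bins=10, range=(0, 1), density=True)
hd = lambda P, Q: np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q)) ** 2))
prevs = np.linspace(0., 1., 100)  # stand-in for F.prevalence_linspace(n_prevalences=100, smooth_limits_epsilon=0.0)
best = prevs[np.argmin([hd(p * d1 + (1 - p) * d0, dt) for p in prevs])]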
-def binary_quant_task(c, learners, X):
-    predictions_ci = learners[c].predict(X)
-    return predictions_ci.mean()  # since the predictions array is binary
-
-class OneVsAllELM(AggregativeQuantifier):
-    def __init__(self, svmperf_base, loss, n_jobs=-1, **kwargs):
-        self.svmperf_base = svmperf_base
-        self.loss = loss
+class OneVsAll(AggregativeQuantifier):
+
+    def __init__(self, binary_method, n_jobs=-1, **kwargs):
+        self.binary_method = binary_method
         self.n_jobs = n_jobs
         self.kwargs = kwargs
 
-    def fit(self, data: LabelledCollection, fit_learner=True, *args):
-        assert fit_learner, 'the method requires that fit_learner=True'
-        self.learners = {c: SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs) for c in data.classes_}
+    def fit(self, data: LabelledCollection, **kwargs):
+        assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
+        self.class_method = {c: self.binary_method(**self.kwargs) for c in data.classes_}
         Parallel(n_jobs=self.n_jobs, backend='threading')(
-            delayed(train_task)(c, self.learners, data) for c in self.learners.keys()
+            delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
         )
         return self
 
-    def quantify(self, X, y=None):
+    def quantify(self, X, *args):
         prevalences = np.asarray(
             Parallel(n_jobs=self.n_jobs, backend='threading')(
-                delayed(binary_quant_task)(c, self.learners, X) for c in self.learners.keys()
+                delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
             )
         )
-        prevalences /= prevalences.sum()
-        return prevalences
+        print('one vs all: ', prevalences)
+        return F.normalize_prevalence(prevalences)
 
     @property
     def classes(self):
-        return sorted(self.learners.keys())
+        return sorted(self.class_method.keys())
 
-    def preclassify_collection(self, data: LabelledCollection):
-        classifications = []
-        for class_ in data.classes_:
-            classifications.append(self.learners[class_].predict(data.instances))
-        classifications = np.vstack(classifications).T
-        precomputed = LabelledCollection(classifications, data.labels)
-        return precomputed
-
     def set_params(self, **parameters):
         self.kwargs=parameters
@@ -306,20 +337,57 @@ class OneVsAllELM(AggregativeQuantifier):
     def get_params(self, deep=True):
         return self.kwargs
 
+    def _delayed_binary_predict(self, c, learners, X):
+        return learners[c].classify(X).mean()  # the mean is the estimation for the positive class prevalence
+
+    def _delayed_binary_fit(self, c, learners, data, **kwargs):
+        bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
+        learners[c].fit(bindata, **kwargs)
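Note: each binary quantifier contributes a single number (the mean of its binary predictions, i.e. its positive-class prevalence estimate), and these estimates need not sum to 1 across classes, hence the final F.normalize_prevalence. With hypothetical per-class estimates:

import numpy as np
per_class = np.asarray([0.5, 0.4, 0.3])  # one positive-prevalence estimate per binary learner
per_class / per_class.sum()              # -> [0.417, 0.333, 0.250], what normalize_prevalence returns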
 class ExplicitLossMinimisation(AggregativeQuantifier):
 
     def __init__(self, svmperf_base, loss, **kwargs):
-        self.learner = SVMperf(svmperf_base, loss=loss, **kwargs)
+        self.svmperf_base = svmperf_base
+        self.loss = loss
+        self.kwargs = kwargs
 
     def fit(self, data: LabelledCollection, fit_learner=True, *args):
         assert fit_learner, 'the method requires that fit_learner=True'
-        self.learner.fit(data.instances, data.labels)
-        return self
+        if data.binary:
+            self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
+        else:
+            self.learner = OneVsAll(
+                binary_method=ExplicitLossMinimisationBinary,
+                n_jobs=-1,
+                svmperf_base=self.svmperf_base,
+                loss=self.loss,
+                **self.kwargs
+            )
+        return self.learner.fit(data, *args)
+
+    def quantify(self, instances, *args):
+        return self.learner.quantify(instances, *args)
+
+
+class ExplicitLossMinimisationBinary(AggregativeQuantifier):
+
+    def __init__(self, svmperf_base, loss, **kwargs):
+        self.svmperf_base = svmperf_base
+        self.loss = loss
+        self.kwargs = kwargs
+
+    def fit(self, data: LabelledCollection, fit_learner=True, *args):
+        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
+        assert fit_learner, 'the method requires that fit_learner=True'
+        self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
+        return self
 
     def quantify(self, X, y=None):
         predictions = self.learner.predict(X)
-        return F.prevalence_from_labels(predictions, self.learner.n_classes_)
+        prev = F.prevalence_from_labels(predictions, self.learner.n_classes_)
+        print('binary: ', prev)
+        return prev
 
     def classify(self, X, y=None):
         return self.learner.predict(X)
@@ -349,3 +417,12 @@ class SVMRAE(ExplicitLossMinimisation):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
 
+CC = ClassifyAndCount
+ACC = AdjustedClassifyAndCount
+PCC = ProbabilisticClassifyAndCount
+PACC = ProbabilisticAdjustedClassifyAndCount
+ELM = ExplicitLossMinimisation
+EMQ = ExpectationMaximizationQuantifier
+HDy = HellingerDistanceY
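Note: with these aliases, the binary vs one-vs-all dispatch in ExplicitLossMinimisation is transparent to callers; a usage sketch, where binary_data and ternary_data stand for LabelledCollection instances:

q = ELM(svmperf_base='./svm_perf_quantification', loss='q', C=1000)
q.fit(binary_data)   # data.binary -> delegates to ExplicitLossMinimisationBinary
q = ELM(svmperf_base='./svm_perf_quantification', loss='q', C=1000)
q.fit(ternary_data)  # otherwise -> OneVsAll(binary_method=ExplicitLossMinimisationBinary, ...)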


@@ -10,7 +10,7 @@ class BaseQuantifier(metaclass=ABCMeta):
     def fit(self, data: qp.LabelledCollection, *args): ...
 
     @abstractmethod
-    def quantify(self, documents, *args): ...
+    def quantify(self, instances, *args): ...
 
     @abstractmethod
     def set_params(self, **parameters): ...

quapy/utils/__init__.py (new file, +1)

@@ -0,0 +1 @@
+from . import util

quapy/utils/util.py (new file, +22)

@@ -0,0 +1,22 @@
+import itertools
+import multiprocessing
+
+from joblib import Parallel, delayed
+
+
+def get_parallel_slices(n_tasks, n_jobs=-1):
+    if n_jobs == -1:
+        n_jobs = multiprocessing.cpu_count()
+    batch = int(n_tasks / n_jobs)
+    remainder = n_tasks % n_jobs
+    return [slice(job * batch, (job + 1) * batch + (remainder if job == n_jobs - 1 else 0)) for job in
+            range(n_jobs)]
+
+
+def parallelize(func, args, n_jobs):
+    slices = get_parallel_slices(len(args), n_jobs)
+    results = Parallel(n_jobs=n_jobs)(
+        delayed(func)(args[slice_i]) for slice_i in slices
+    )
+    return list(itertools.chain.from_iterable(results))
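Note: a quick sanity check of the slicing logic (the remainder is absorbed by the last job):

get_parallel_slices(10, n_jobs=3)
# -> [slice(0, 3), slice(3, 6), slice(6, 10)]  # batch=3; the remaining task goes to the last slice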

test.py (new file, +46)

@@ -0,0 +1,46 @@
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import LinearSVC
+
+import quapy as qp
+import quapy.functional as F
+
+
+# load a textual binary dataset and create a tfidf bag of words
+train_path = './datasets/reviews/kindle/train.txt'
+test_path = './datasets/reviews/kindle/test.txt'
+dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
+dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
+dataset.test = dataset.test.sampling(500, 0.6, 0.4)
+qp.preprocessing.text2tfidf(dataset, inplace=True)
+qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
+
+# load a sparse matrix ternary dataset
+#train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
+#test_path = './datasets/twitter/test/sst.test.feature.txt'
+#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
+#dataset.training = dataset.training.sampling(500, 0.3, 0.2, 0.5)
+#dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)
+
+# training a quantifier
+learner = LogisticRegression()
+# q = qp.method.aggregative.ClassifyAndCount(learner)
+# q = qp.method.aggregative.AdjustedClassifyAndCount(learner)
+# q = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
+# q = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
+# q = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
+# q = qp.method.aggregative.ExplicitLossMinimisation(svmperf_base='./svm_perf_quantification', loss='q', verbose=0, C=1000)
+# q = qp.method.aggregative.SVMQ(svmperf_base='./svm_perf_quantification', verbose=0, C=1000)
+q = qp.method.aggregative.HDy(learner)
+q.fit(dataset.training)
+
+# estimating class prevalences
+prevalences_estim = q.quantify(dataset.test.instances)
+prevalences_true = dataset.test.prevalence()
+
+# evaluation (one single prediction)
+error = qp.error.mae(prevalences_true, prevalences_estim)
+
+print(f'method {q.__class__.__name__}')
+print(f'true prevalence {F.strprev(prevalences_true)}')
+print(f'estim prevalence {F.strprev(prevalences_estim)}')
+print(f'MAE={error:.3f}')