evaluation by artificial prevalence sampling added. New methods added. New util functions added to quapy.functional and quapy.utils

2020-12-10 19:04:33 +01:00 · 2020-12-10 19:04:33 +01:00 · 9bc3a9f28a
parent a882424eeb
commit 9bc3a9f28a
17 changed files with 444 additions and 110 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,3 @@
 # QuaPy
-A Python framework for Quantification
+A Quantification framework written in Python.
--- a/TODO.txt
+++ b/TODO.txt
@ -1,3 +1,8 @@
 Documentation with sphinx
-The parallel training in svmperf seems not to work
+Add evaluation - artificial sampling
-Add "prepare svmperf for quantification" script
+Add quantification_report (akin to classification_report from sklearn)
 Add optimization - artificial sampling
 Add prediction - artificial sampling
 Add readers for typical datasets used in Quantification
 Add NAE, NRAE
 Add "measures for evaluating ordinal"?
--- a/quapy/init.py
+++ b/quapy/init.py
@ -1,6 +1,5 @@
-from .dataset import *
+from .data import *
 from . import functional
 from . import method
 from . import error
-
+from . import evaluation
--- a/quapy/classification/svmperf.py
+++ b/quapy/classification/svmperf.py
@ -20,12 +20,9 @@ class SVMperf(BaseEstimator, ClassifierMixin):
        self.verbose = verbose
        self.loss = loss
    def set_c(self, C):
        self.param_C = '-c ' + str(C)
    def set_params(self, **parameters):
        assert list(parameters.keys()) == ['C'], 'currently, only the C parameter is supported'
-        self.set_c(parameters['C'])
+        self.C = parameters['C']
    def fit(self, X, y):
        assert self.loss in SVMperf.valid_losses, \
@ -33,8 +30,8 @@ class SVMperf(BaseEstimator, ClassifierMixin):
        self.svmperf_learn = join(self.svmperf_base, 'svm_perf_learn')
        self.svmperf_classify = join(self.svmperf_base, 'svm_perf_classify')
-        self.loss_cmd = '-l ' + str(self.valid_losses[self.loss])
+        self.loss_cmd = '-w 3 -l ' + str(self.valid_losses[self.loss])
-        self.set_c(self.C)
+        self.c_cmd = '-c ' + str(self.C)
        self.classes_ = sorted(np.unique(y))
        self.n_classes_ = len(self.classes_)
@ -49,7 +46,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):
        dump_svmlight_file(X, y, traindat, zero_based=False)
-        cmd = ' '.join([self.svmperf_learn, self.param_C, self.loss_cmd, traindat, self.model])
+        cmd = ' '.join([self.svmperf_learn, self.c_cmd, self.loss_cmd, traindat, self.model])
        if self.verbose:
            print('[Running]', cmd)
        p = subprocess.run(cmd.split(), stdout=PIPE, stderr=STDOUT)
@ -60,7 +57,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):
        return self
-    def predict(self, X, y=None):
+    def predict(self, X):
        confidence_scores = self.decision_function(X)
        predictions = (confidence_scores > 0) * 1
        return predictions
--- a/quapy/dataset/init.py
+++ b/quapy/dataset/init.py
--- a/quapy/dataset/base.py
+++ b/quapy/dataset/base.py
@ -22,12 +22,6 @@ class LabelledCollection:
    def load(cls, path:str, loader_func:callable):
        return LabelledCollection(*loader_func(path))
    @classmethod
    def load_dataset(cls, train_path, test_path):
        training = cls.load(train_path)
        test = cls.load(test_path)
        return Dataset(training, test)
    def __len__(self):
        return self.instances.shape[0]
@ -49,7 +43,7 @@ class LabelledCollection:
        if len(prevs) == self.n_classes-1:
            prevs = prevs + (1-sum(prevs),)
        assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
-        assert sum(prevs) == 1, f'prevalences ({prevs}) out of range (sum={sum(prevs)})'
+        assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
        taken = 0
        indexes_sample = []
@ -93,6 +87,11 @@ class LabelledCollection:
        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
            yield self.sampling(sample_size, *prevs)
    def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
        dimensions=self.n_classes
        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
            yield self.sampling_index(sample_size, *prevs)
    def __add__(self, other):
        if issparse(self.instances) and issparse(other.documents):
            docs = vstack([self.instances, other.documents])
--- a/quapy/dataset/preprocessing.py
+++ b/quapy/dataset/preprocessing.py
@ -1,9 +1,10 @@
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from dataset.base import Dataset
+from data.base import Dataset
 from scipy.sparse import spmatrix
 from utils.util import parallelize
 from .base import LabelledCollection
 from tqdm import tqdm
 def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
@ -78,8 +79,8 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
    :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
    consisting of lists of integer values representing indices.
    """
-    __check_type(dataset.training.instances, list, str)
+    __check_type(dataset.training.instances, np.ndarray, str)
-    __check_type(dataset.test.instances, list, str)
+    __check_type(dataset.test.instances, np.ndarray, str)
    indexer = IndexTransformer(min_df=min_df, **kwargs)
    training_index = indexer.fit_transform(dataset.training.instances)
@ -105,7 +106,6 @@ def __check_type(container, container_type=None, element_type=None):
            f'unexpected type of element (expected {container_type}, found {type(container)})'
 class IndexTransformer:
    def __init__(self, **kwargs):
@ -140,7 +140,7 @@ class IndexTransformer:
        return self.fit(X).transform(X, n_jobs=n_jobs)
    def vocabulary_size(self):
-        return len(self.vocabulary_) + 1  # the reserved unk token
+        return len(self.vocabulary_)
    def add_word(self, word):
        if word in self.vocabulary_:
--- a/quapy/dataset/reader.py
+++ b/quapy/dataset/reader.py
--- a/quapy/error.py
+++ b/quapy/error.py
@ -1,5 +1,8 @@
 from sklearn.metrics import f1_score
-from settings import SAMPLE_SIZE
+import numpy as np
 SAMPLE_SIZE = None
 def f1e(y_true, y_pred):
@ -7,8 +10,7 @@ def f1e(y_true, y_pred):
 def acce(y_true, y_pred):
-    acc = (y_true == y_pred).mean()
+    return 1. - (y_true == y_pred).mean()
    return 1. - acc
 def mae(prevs, prevs_hat):
@ -20,11 +22,40 @@ def ae(p, p_hat):
    return abs(p_hat-p).mean(axis=-1)
-def mrae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)):
+def mse(prevs, prevs_hat):
    return se(prevs, prevs_hat).mean()
 def se(p, p_hat):
    return ((p_hat-p)**2).mean(axis=-1)
 def mkld(prevs, prevs_hat):
    return kld(prevs, prevs_hat).mean()
 def kld(p, p_hat, eps=None):
    eps = __check_eps(eps)
    sp = p+eps
    sp_hat = p_hat + eps
    return (sp*np.log(sp/sp_hat)).sum(axis=-1)
 def mnkld(prevs, prevs_hat):
    return nkld(prevs, prevs_hat).mean()
 def nkld(p, p_hat, eps=None):
    ekld = np.exp(kld(p, p_hat, eps))
    return 2. * ekld / (1 + ekld) - 1.
 def mrae(p, p_hat, eps=None):
    return rae(p, p_hat, eps).mean()
-def rae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)):
+def rae(p, p_hat, eps=None):
    eps = __check_eps(eps)
    p = smooth(p, eps)
    p_hat = smooth(p_hat, eps)
    return (abs(p-p_hat)/p).mean(axis=-1)
@ -35,8 +66,17 @@ def smooth(p, eps):
    return (p+eps)/(eps*n_classes + 1)
 def __check_eps(eps):
    if eps is None:
        if SAMPLE_SIZE is None:
            raise ValueError('eps was not defined, and qp.error.SAMPLE_SIZE was not set')
        else:
            eps = 1. / (2. * SAMPLE_SIZE)
    return eps
 CLASSIFICATION_ERROR = {f1e, acce}
-QUANTIFICATION_ERROR = {mae, mrae}
+QUANTIFICATION_ERROR = {mae, mrae, mse, mkld, mnkld}
 f1_error = f1e
 acc_error = acce
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@ -0,0 +1,53 @@
 from data import LabelledCollection
 from method.base import BaseQuantifier
 from utils.util import temp_seed
 import numpy as np
 from joblib import Parallel, delayed
 from tqdm import tqdm
 def artificial_sampling_prediction(
        model: BaseQuantifier,
        test: LabelledCollection,
        sample_size,
        prevalence_points=21,
        point_repetitions=1,
        n_jobs=-1,
        random_seed=42):
    """
    Performs the predictions for all samples generated according to the artificial sampling protocol.
    :param model: the model in charge of generating the class prevalence estimations
    :param test: the test set on which to perform artificial sampling
    :param sample_size: the size of the samples
    :param prevalence_points: the number of different prevalences to sample
    :param point_repetitions: the number of repetitions for each prevalence
    :param n_jobs: number of jobs to be run in parallel
    :param random_seed: allows the samplings to be replicated. The seed is local to the method and does not affect
    any other random process.
    :return: two ndarrays of [m,n] with m the number of samples (prevalence_points*point_repetitions) and n the
     number of classes. The first one contains the true prevalences for the samples generated while the second one
     contains the prevalence estimations
    """
    with temp_seed(random_seed):
        indexes = list(test.artificial_sampling_index_generator(sample_size, prevalence_points, point_repetitions))
    def _predict_prevalences(index):
        sample = test.sampling_from_index(index)
        true_prevalence = sample.prevalence()
        estim_prevalence = model.quantify(sample.instances)
        return true_prevalence, estim_prevalence
    results = Parallel(n_jobs=n_jobs)(
        delayed(_predict_prevalences)(index) for index in tqdm(indexes)
    )
    true_prevalences, estim_prevalences = zip(*results)
    true_prevalences = np.asarray(true_prevalences)
    estim_prevalences = np.asarray(estim_prevalences)
    return true_prevalences, estim_prevalences
--- a/quapy/functional.py
+++ b/quapy/functional.py
@ -15,6 +15,26 @@ def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, retur
    return prevs
 def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
    """
    Produces uniformly separated values of prevalence. By default, produces an array of 21 prevalences, with step 0.05
    and with the limits smoothed, i.e.:
    [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99]
    :param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
    :param repeat: number of times each prevalence is to be repeated (defaults to 1)
    :param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
    :return: an array of uniformly separated prevalence values
    """
    p = np.linspace(0., 1., num=n_prevalences, endpoint=True)
    p[0] += smooth_limits_epsilon
    p[-1] -= smooth_limits_epsilon
    if p[0] > p[1]:
        raise ValueError(f'the smoothing in the limits is greater than the prevalence step')
    if repeat > 1:
        p = np.repeat(p, repeat)
    return p
 def prevalence_from_labels(labels, n_classes):
    unique, counts = np.unique(labels, return_counts=True)
    by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
@ -47,3 +67,54 @@ def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True):
    return adjusted
 def normalize_prevalence(prevalences):
    assert prevalences.ndim==1, 'unexpected shape'
    accum = prevalences.sum()
    if accum > 0:
        return prevalences / accum
    else:
        # if all classifiers are trivial rejectors
        return np.ones_like(prevalences) / prevalences.size
 def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
    """
    Computes the number of prevalence combinations in the nclasses-dimensional simplex if nprevpoints equally distant
    prevalences are generated and nrepeats repetitions are requested
    :param nclasses: number of classes
    :param nprevpoints: number of prevalence points.
    :param nrepeats: number of repetitions for each prevalence combination
    :return: The number of possible combinations. For example, if nclasses=2, nprevpoints=5, nrepeats=1, then the number
    of possible combinations is 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
    """
    __cache={}
    def __f(nc,np):
        if (nc,np) in __cache:
            return __cache[(nc,np)]
        if nc==1:
            return 1
        else:
            x = sum([__f(nc-1, np-i) for i in range(np)])
            __cache[(nc,np)] = x
            return x
    return __f(nclasses, nprevpoints) * nrepeats
 def get_nprevpoints_approximation(nclasses, nrepeats, combinations_budget):
    """
    Searches for the largest number of (equidistant) prevalence points to define for each of the nclasses classes so that
    the number of valid prevalences generated as combinations of prevalence points (points in a nclasses-dimensional
    simplex) does not exceed combinations_budget.
    :param nclasses: number of classes
    :param nrepeats: number of repetitions for each prevalence combination
    :param combinations_budget: maximum number of combinations allowed
    :return: the largest number of prevalence points that generate no more than combinations_budget valid prevalences
    """
    assert nclasses>0 and nrepeats>0 and combinations_budget>0, 'parameters must be positive integers'
    nprevpoints = 1
    while True:
        combinations = num_prevalence_combinations(nclasses, nprevpoints, nrepeats)
        if combinations > combinations_budget:
            return nprevpoints-1
        else:
            nprevpoints+=1
--- a/quapy/method/init.py
+++ b/quapy/method/init.py
@ -1,5 +1,6 @@
 from . import base
 from . import aggregative as agg
-from . import non_aggregative as nagg
+from . import non_aggregative
 AGGREGATIVE_METHODS = {
@ -9,22 +10,14 @@ AGGREGATIVE_METHODS = {
    agg.ProbabilisticAdjustedClassifyAndCount,
    agg.ExplicitLossMinimisation,
    agg.ExpectationMaximizationQuantifier,
    agg.HellingerDistanceY
 }
 NON_AGGREGATIVE_METHODS = {
-    nagg.MaximumLikelihoodPrevalenceEstimation
+    non_aggregative.MaximumLikelihoodPrevalenceEstimation
 }
 QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS
 # common aliases
 CC = agg.ClassifyAndCount
 ACC = agg.AdjustedClassifyAndCount
 PCC = agg.ProbabilisticClassifyAndCount
 PACC = agg.ProbabilisticAdjustedClassifyAndCount
 ELM = agg.ExplicitLossMinimisation
 EMQ = agg.ExpectationMaximizationQuantifier
 MLPE = nagg.MaximumLikelihoodPrevalenceEstimation
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@ -1,12 +1,14 @@
 import numpy as np
-from .base import *
+from copy import deepcopy
 from ..error import mae
 import functional as F
-from ..classification.svmperf import SVMperf
+import error
-from ..dataset import LabelledCollection
+from method.base import BaseQuantifier
 from quapy.classification.svmperf import SVMperf
 from quapy.data import LabelledCollection
 from sklearn.metrics import confusion_matrix
 from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
 from abc import abstractmethod
 # Abstract classes
@ -21,8 +23,16 @@ class AggregativeQuantifier(BaseQuantifier):
    @abstractmethod
    def fit(self, data: LabelledCollection, fit_learner=True, *args): ...
-    def classify(self, documents):
+    @property
-        return self.learner.predict(documents)
+    def learner(self):
        return self.learner_
    @learner.setter
    def learner(self, value):
        self.learner_ = value
    def classify(self, instances):
        return self.learner.predict(instances)
    def get_params(self, deep=True):
        return self.learner.get_params()
@ -67,12 +77,12 @@ def training_helper(learner,
    Training procedure common to all Aggregative Quantifiers.
    :param learner: the learner to be fit
    :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
-    :param fit_learner: whether or not to fit the learner
+    :param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
    :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
    learner is not probabilistic, then a CalibratedCV instance of it is trained)
-    :param train_val_split: if specified, indicates the proportion of training documents on which to fit the learner
+    :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
    :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
-    or None otherwise)
+    or None otherwise) to be used as a validation set for any subsequent parameter fitting
    """
    if fit_learner:
        if ensure_probabilistic:
@ -118,8 +128,8 @@ class ClassifyAndCount(AggregativeQuantifier):
        self.learner, _ = training_helper(self.learner, data, fit_learner)
        return self
-    def quantify(self, documents, *args):
+    def quantify(self, instances, *args):
-        classification = self.classify(documents)  # classify
+        classification = self.classify(instances)  # classify
        return F.prevalence_from_labels(classification, self.n_classes)  # & count
@ -138,8 +148,8 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
        self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
        return self
-    def quantify(self, documents, *args):
+    def quantify(self, instances, *args):
-        prevs_estim = self.cc.quantify(documents)
+        prevs_estim = self.cc.quantify(instances)
        # solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim
        A = self.Pte_cond_estim_
        B = prevs_estim
@ -163,8 +173,8 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
        self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
        return self
-    def quantify(self, documents, *args):
+    def quantify(self, instances, *args):
-        posteriors = self.soft_classify(documents)                        # classify
+        posteriors = self.soft_classify(instances)  # classify
        prevalences = F.prevalence_from_probabilities(posteriors, binarize=False)  # & count
        return prevalences
@ -186,8 +196,8 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
        self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
        return self
-    def quantify(self, documents, *args):
+    def quantify(self, instances, *args):
-        prevs_estim = self.pcc.quantify(documents)
+        prevs_estim = self.pcc.quantify(instances)
        A = self.Pte_cond_estim_
        B = prevs_estim
        try:
@ -237,7 +247,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
            # M-step: qs_pos is Ps+1(y=+1)
            qs = ps.mean(axis=0)
-            if qs_prev_ is not None and mae(qs, qs_prev_) < epsilon and s>10:
+            if qs_prev_ is not None and error.mae(qs, qs_prev_) < epsilon and s>10:
                converged = True
            qs_prev_ = qs
@ -252,79 +262,149 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
        return qs
-# todo: from here
+class HellingerDistanceY(AggregativeProbabilisticQuantifier):
-def train_task(c, learners, data):
+    """
-    learners[c].fit(data.documents, data.labels == c)
+    Implementation of the method based on the Hellinger Distance y (HDy) proposed by
    González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution
    estimation based on the Hellinger distance. Information Sciences, 218:146–164.
    """
    def __init__(self, learner):
        self.learner = learner
    def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
                            f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
        self.learner, validation = training_helper(
            self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
        Px = self.soft_classify(validation.instances)
        self.Pxy1 = Px[validation.labels == 1]
        self.Pxy0 = Px[validation.labels == 0]
        return self
    def quantify(self, instances, *args):
        # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
        # and the final estimated a priori probability was taken as the median of these 11 estimates."
        # (González-Castro, et al., 2013).
        Px = self.soft_classify(instances)
        prev_estimations = []
        for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
            Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
            Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
            Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True)
            prev_selected, min_dist = None, None
            for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0):
                Px_train = prev*Pxy1_density + (1 - prev)*Pxy0_density
                hdy = HellingerDistanceY.HellingerDistance(Px_train, Px_test)
                if prev_selected is None or hdy < min_dist:
                    prev_selected, min_dist = prev, hdy
            prev_estimations.append(prev_selected)
        pos_class_prev = np.median(prev_estimations)
        return np.asarray([1-pos_class_prev, pos_class_prev])
    @classmethod
    def HellingerDistance(cls, P, Q):
        return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
-def binary_quant_task(c, learners, X):
+class OneVsAll(AggregativeQuantifier):
-    predictions_ci = learners[c].predict(X)
+    """
-    return predictions_ci.mean()  # since the predictions array is binary
+    Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
    quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
    """
    def __init__(self, binary_method, n_jobs=-1):
        self.binary_method = binary_method
        self.n_jobs = n_jobs
    def fit(self, data: LabelledCollection, **kwargs):
        assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
        assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
        self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
        Parallel(n_jobs=self.n_jobs, backend='threading')(
            delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
        )
        return self
    def quantify(self, X, *args):
        prevalences = np.asarray(
            Parallel(n_jobs=self.n_jobs, backend='threading')(
                delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
            )
        )
        return F.normalize_prevalence(prevalences)
    @property
    def classes(self):
        return sorted(self.class_method.keys())
    def set_params(self, **parameters):
        self.binary_method.set_params(**parameters)
    def get_params(self, deep=True):
        return self.binary_method.get_params()
    def _delayed_binary_predict(self, c, learners, X):
        return learners[c].classify(X).mean()  # the mean is the estimation for the positive class prevalence
    def _delayed_binary_fit(self, c, learners, data, **kwargs):
        bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
        learners[c].fit(bindata, **kwargs)
-class OneVsAllELM(AggregativeQuantifier):
+class ExplicitLossMinimisation(AggregativeQuantifier):
    """
    A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
    quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
    This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
    Social Network Analysis and Mining 6(19), 1–22 (2016)
    """
-    def __init__(self, svmperf_base, loss, n_jobs=-1, **kwargs):
+    def __init__(self, svmperf_base, loss, **kwargs):
        self.svmperf_base = svmperf_base
        self.loss = loss
        self.n_jobs = n_jobs
        self.kwargs = kwargs
    def fit(self, data: LabelledCollection, fit_learner=True, *args):
        assert fit_learner, 'the method requires that fit_learner=True'
        self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
        if not data.binary:
            self.learner = OneVsAll(self.learner, n_jobs=-1)
        return self.learner.fit(data, *args)
-        self.learners = {c: SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs) for c in data.classes_}
+    def quantify(self, instances, *args):
-        Parallel(n_jobs=self.n_jobs, backend='threading')(
+        return self.learner.quantify(instances, *args)
            delayed(train_task)(c, self.learners, data) for c in self.learners.keys()
        )
        return self
    def quantify(self, X, y=None):
        prevalences = np.asarray(
            Parallel(n_jobs=self.n_jobs, backend='threading')(
                delayed(binary_quant_task)(c, self.learners, X) for c in self.learners.keys()
            )
        )
        prevalences /= prevalences.sum()
        return prevalences
    @property
    def classes(self):
        return sorted(self.learners.keys())
    def preclassify_collection(self, data: LabelledCollection):
        classifications = []
        for class_ in data.classes_:
            classifications.append(self.learners[class_].predict(data.instances))
        classifications = np.vstack(classifications).T
        precomputed = LabelledCollection(classifications, data.labels)
        return precomputed
    def set_params(self, **parameters):
        self.kwargs=parameters
    def get_params(self, deep=True):
        return self.kwargs
-class ExplicitLossMinimisation(AggregativeQuantifier):
+class ExplicitLossMinimisationBinary(AggregativeQuantifier):
    def __init__(self, svmperf_base, loss, **kwargs):
-        self.learner = SVMperf(svmperf_base, loss=loss, **kwargs)
+        self.svmperf_base = svmperf_base
        self.loss = loss
        self.kwargs = kwargs
    def fit(self, data: LabelledCollection, fit_learner=True, *args):
        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
        assert fit_learner, 'the method requires that fit_learner=True'
-        self.learner.fit(data.instances, data.labels)
+        self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
        return self
    def quantify(self, X, y=None):
        predictions = self.learner.predict(X)
-        return F.prevalence_from_labels(predictions, self.learner.n_classes_)
+        prev = F.prevalence_from_labels(predictions, self.learner.n_classes_)
        print('binary: ', prev)
        return prev
    def classify(self, X, y=None):
        return self.learner.predict(X)
 class SVMQ(ExplicitLossMinimisation):
    def __init__(self, svmperf_base, **kwargs):
        super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
@ -349,3 +429,12 @@ class SVMRAE(ExplicitLossMinimisation):
    def __init__(self, svmperf_base, **kwargs):
        super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
 CC = ClassifyAndCount
 ACC = AdjustedClassifyAndCount
 PCC = ProbabilisticClassifyAndCount
 PACC = ProbabilisticAdjustedClassifyAndCount
 ELM = ExplicitLossMinimisation
 EMQ = ExpectationMaximizationQuantifier
 HDy = HellingerDistanceY
--- a/quapy/method/base.py
+++ b/quapy/method/base.py
@ -1,5 +1,4 @@
 from abc import ABCMeta, abstractmethod
 import quapy as qp
 # Base Quantifier abstract class
@ -7,10 +6,10 @@ import quapy as qp
 class BaseQuantifier(metaclass=ABCMeta):
    @abstractmethod
-    def fit(self, data: qp.LabelledCollection, *args): ...
+    def fit(self, data, *args): ...
    @abstractmethod
-    def quantify(self, documents, *args): ...
+    def quantify(self, instances, *args): ...
    @abstractmethod
    def set_params(self, **parameters): ...
--- a/quapy/utils/init.py
+++ b/quapy/utils/init.py
@ -0,0 +1 @@
 from . import util
--- a/quapy/utils/util.py
+++ b/quapy/utils/util.py
@ -0,0 +1,35 @@
 import itertools
 import multiprocessing
 from joblib import Parallel, delayed
 import contextlib
 import numpy as np
 def get_parallel_slices(n_tasks, n_jobs=-1):
    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()
    batch = int(n_tasks / n_jobs)
    remainder = n_tasks % n_jobs
    return [slice(job * batch, (job + 1) * batch + (remainder if job == n_jobs - 1 else 0)) for job in
            range(n_jobs)]
 def parallelize(func, args, n_jobs):
    args = np.asarray(args)
    slices = get_parallel_slices(len(args), n_jobs)
    results = Parallel(n_jobs=n_jobs)(
        delayed(func)(args[slice_i]) for slice_i in slices
    )
    return list(itertools.chain.from_iterable(results))
@contextlib.contextmanager
 def temp_seed(seed):
    state = np.random.get_state()
    np.random.seed(seed)
    try:
        yield
    finally:
        np.random.set_state(state)
--- a/test.py
+++ b/test.py
@ -0,0 +1,53 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC
 import quapy as qp
 import quapy.functional as F
 SAMPLE_SIZE=500
 binary = False
 if binary:
    # load a textual binary dataset and create a tfidf bag of words
    train_path = './datasets/reviews/kindle/train.txt'
    test_path = './datasets/reviews/kindle/test.txt'
    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
    qp.preprocessing.text2tfidf(dataset, inplace=True)
    qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
 else:
    # load a sparse matrix ternary dataset
    train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
    test_path = './datasets/twitter/test/sst.test.feature.txt'
    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
 # training a quantifier
 learner = LogisticRegression()
 model = qp.method.aggregative.ClassifyAndCount(learner)
 # model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
 # model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
 # model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
 # model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
 # model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
 model.fit(dataset.training)
 # estimating class prevalences
 prevalences_estim = model.quantify(dataset.test.instances)
 prevalences_true  = dataset.test.prevalence()
 # evaluation (one single prediction)
 error = qp.error.mae(prevalences_true, prevalences_estim)
 print(f'method {model.__class__.__name__}')
 print(f'Evaluation in test (1 eval)')
 print(f'true prevalence {F.strprev(prevalences_true)}')
 print(f'estim prevalence {F.strprev(prevalences_estim)}')
 print(f'mae={error:.3f}')
 true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE)
 qp.error.SAMPLE_SIZE=SAMPLE_SIZE
 print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
 for error in qp.error.QUANTIFICATION_ERROR:
    score = error(true_prev, estim_prev)
    print(f'{error.__name__}={score:.5f}')
`@ -1,3 +1,3 @@`
	`# QuaPy`	`# QuaPy`

	`A Python framework for Quantification`	`A Quantification framework written in Python.`