
aggregation methods updated

Alejandro Moreo Fernandez 2020-12-09 12:46:50 +01:00
parent 9c8d29156c
commit 2361186a01
9 changed files with 71 additions and 53 deletions

View File

@ -1,4 +1,4 @@
from .dataset import *
from .data import *
from . import functional
from . import method
from . import error

View File

@ -1,9 +1,10 @@
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from dataset.base import Dataset
from data.base import Dataset
from scipy.sparse import spmatrix
from utils.util import parallelize
from .base import LabelledCollection
from tqdm import tqdm
def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
@ -78,8 +79,8 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
consisting of lists of integer values representing indices.
"""
__check_type(dataset.training.instances, list, str)
__check_type(dataset.test.instances, list, str)
__check_type(dataset.training.instances, np.ndarray, str)
__check_type(dataset.test.instances, np.ndarray, str)
indexer = IndexTransformer(min_df=min_df, **kwargs)
training_index = indexer.fit_transform(dataset.training.instances)
@ -105,7 +106,6 @@ def __check_type(container, container_type=None, element_type=None):
f'unexpected type of element (expected {container_type}, found {type(container)})'
class IndexTransformer:
def __init__(self, **kwargs):
@ -140,7 +140,7 @@ class IndexTransformer:
return self.fit(X).transform(X, n_jobs=n_jobs)
def vocabulary_size(self):
return len(self.vocabulary_) + 1 # the reserved unk token
return len(self.vocabulary_)
def add_word(self, word):
if word in self.vocabulary_:
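
A minimal usage sketch of the indexing step documented above. It is hedged: it assumes `index` is exposed under the same `qp.preprocessing` namespace that test.py (at the end of this diff) uses for `text2tfidf`, and it reuses the kindle review paths from that script as placeholders.

import quapy as qp

# placeholder paths, taken from test.py below
train_path = './datasets/reviews/kindle/train.txt'
test_path = './datasets/reviews/kindle/test.txt'

# after this commit, raw textual instances are expected to be an np.ndarray of str (no longer a list of str)
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)

# returns a new Dataset (inplace=False) whose instances are lists of integer indices
indexed = qp.preprocessing.index(dataset, min_df=5, inplace=False)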

View File

@ -1,5 +1,6 @@
from . import base
from . import aggregative as agg
from . import non_aggregative as nagg
from . import non_aggregative
AGGREGATIVE_METHODS = {
@ -13,13 +14,10 @@ AGGREGATIVE_METHODS = {
}
NON_AGGREGATIVE_METHODS = {
nagg.MaximumLikelihoodPrevalenceEstimation
non_aggregative.MaximumLikelihoodPrevalenceEstimation
}
QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS
# common aliases
MLPE = nagg.MaximumLikelihoodPrevalenceEstimation

View File

@ -1,14 +1,14 @@
import numpy as np
from .base import *
from ..error import mae
from copy import deepcopy
import functional as F
from ..classification.svmperf import SVMperf
from ..dataset import LabelledCollection
import error
from method.base import BaseQuantifier
from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection
from sklearn.metrics import confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
from joblib import Parallel, delayed
from abc import abstractmethod
# Abstract classes
@ -23,6 +23,14 @@ class AggregativeQuantifier(BaseQuantifier):
@abstractmethod
def fit(self, data: LabelledCollection, fit_learner=True, *args): ...
@property
def learner(self):
return self.learner_
@learner.setter
def learner(self, value):
self.learner_ = value
def classify(self, instances):
return self.learner.predict(instances)
@ -69,12 +77,12 @@ def training_helper(learner,
Training procedure common to all Aggregative Quantifiers.
:param learner: the learner to be fit
:param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
:param fit_learner: whether or not to fit the learner
:param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
:param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
learner is not probabilistic, then a CalibratedCV instance of it is trained)
:param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
:return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
or None otherwise)
or None otherwise) to be used as a validation set for any subsequent parameter fitting
"""
if fit_learner:
if ensure_probabilistic:
@ -239,7 +247,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
# M-step: qs_pos is Ps+1(y=+1)
qs = ps.mean(axis=0)
if qs_prev_ is not None and mae(qs, qs_prev_) < epsilon and s>10:
if qs_prev_ is not None and error.mae(qs, qs_prev_) < epsilon and s>10:
converged = True
qs_prev_ = qs
@ -265,7 +273,8 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
f'Use the class OneVsAll to enable {self.__class__.__name__} to work on single-label data.'
self.learner, validation = training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
Px = self.soft_classify(validation.instances)
@ -304,15 +313,19 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
class OneVsAll(AggregativeQuantifier):
"""
Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
"""
def __init__(self, binary_method, n_jobs=-1, **kwargs):
def __init__(self, binary_method, n_jobs=-1):
self.binary_method = binary_method
self.n_jobs = n_jobs
self.kwargs = kwargs
def fit(self, data: LabelledCollection, **kwargs):
assert not data.binary, f'{self.__class__.__name__} expects non-binary data'
self.class_method = {c: self.binary_method(**self.kwargs) for c in data.classes_}
assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
Parallel(n_jobs=self.n_jobs, backend='threading')(
delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
)
@ -332,10 +345,10 @@ class OneVsAll(AggregativeQuantifier):
return sorted(self.class_method.keys())
def set_params(self, **parameters):
self.kwargs=parameters
self.binary_method.set_params(**parameters)
def get_params(self, deep=True):
return self.kwargs
return self.binary_method.get_params()
def _delayed_binary_predict(self, c, learners, X):
return learners[c].classify(X).mean() # the mean is the estimation for the positive class prevalence
@ -346,6 +359,12 @@ class OneVsAll(AggregativeQuantifier):
class ExplicitLossMinimisation(AggregativeQuantifier):
"""
A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining 6(19), 1-22 (2016)
"""
def __init__(self, svmperf_base, loss, **kwargs):
self.svmperf_base = svmperf_base
@ -354,16 +373,9 @@ class ExplicitLossMinimisation(AggregativeQuantifier):
def fit(self, data: LabelledCollection, fit_learner=True, *args):
assert fit_learner, 'the method requires that fit_learner=True'
if data.binary:
self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
else:
self.learner = OneVsAll(
binary_method=ExplicitLossMinimisationBinary,
n_jobs=-1,
svmperf_base=self.svmperf_base,
loss=self.loss,
**self.kwargs
)
self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
if not data.binary:
self.learner = OneVsAll(self.learner, n_jobs=-1)
return self.learner.fit(data, *args)
def quantify(self, instances, *args):
@ -393,6 +405,7 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier):
return self.learner.predict(X)
class SVMQ(ExplicitLossMinimisation):
def __init__(self, svmperf_base, **kwargs):
super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
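
The refactor above changes how OneVsAll is constructed: it now receives an already-instantiated binary quantifier (rather than a class plus **kwargs), deep-copies that instance once per class at fit time, and delegates get_params/set_params to the wrapped quantifier. A minimal sketch of the new contract, mirroring the usage in test.py at the end of this diff (the dataset paths are the same placeholders used there):

from sklearn.linear_model import LogisticRegression
import quapy as qp
from method.aggregative import OneVsAll

# a ternary (non-binary) dataset, as in test.py below
train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
test_path = './datasets/twitter/test/sst.test.feature.txt'
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)

binary_quantifier = qp.method.aggregative.HDy(LogisticRegression())  # any binary quantifier works
model = OneVsAll(binary_quantifier, n_jobs=-1)  # one deepcopy of binary_quantifier per class
print(model.get_params())                       # delegated to the wrapped binary quantifier
model.fit(dataset.training)                     # asserts that the data is not binary
estim_prevalences = model.quantify(dataset.test.instances)  # l1-normalized class prevalences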

View File

@ -1,5 +1,4 @@
from abc import ABCMeta, abstractmethod
import quapy as qp
# Base Quantifier abstract class
@ -7,7 +6,7 @@ import quapy as qp
class BaseQuantifier(metaclass=ABCMeta):
@abstractmethod
def fit(self, data: qp.LabelledCollection, *args): ...
def fit(self, data, *args): ...
@abstractmethod
def quantify(self, instances, *args): ...
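
The base interface above now leaves the type of `data` unannotated in `fit` (dropping the direct dependency on `qp.LabelledCollection`). A hedged sketch of a custom quantifier written against the two abstract methods visible in this hunk; if BaseQuantifier declares further abstract members not shown here (e.g. get_params/set_params), those would need to be implemented as well:

import numpy as np
from method.base import BaseQuantifier

class UniformPrevalenceQuantifier(BaseQuantifier):
    # hypothetical example: ignores the training data and always predicts the uniform prevalence

    def fit(self, data, *args):
        self.n_classes_ = len(data.classes_)
        return self

    def quantify(self, instances, *args):
        return np.full(self.n_classes_, 1. / self.n_classes_)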

test.py
View File

@ -2,23 +2,25 @@ from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from method.aggregative import OneVsAll
# load a textual binary dataset and create a tfidf bag of words
#from method.aggregative import OneVsAll, BaseQuantifier
train_path = './datasets/reviews/kindle/train.txt'
test_path = './datasets/reviews/kindle/test.txt'
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
dataset.test = dataset.test.sampling(500, 0.6, 0.4)
qp.preprocessing.text2tfidf(dataset, inplace=True)
qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
#dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
#dataset.test = dataset.test.sampling(500, 0.6, 0.4)
#qp.preprocessing.text2tfidf(dataset, inplace=True)
#qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
# load a sparse matrix ternary dataset
#train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
#test_path = './datasets/twitter/test/sst.test.feature.txt'
#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
#dataset.training = dataset.training.sampling(500, 0.3, 0.2, 0.5)
#dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)
train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
test_path = './datasets/twitter/test/sst.test.feature.txt'
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
dataset.training = dataset.training.sampling(500, 0.3, 0.4, 0.3)
dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)
# training a quantifier
learner = LogisticRegression()
@ -30,17 +32,23 @@ learner = LogisticRegression()
# q = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
# q = qp.method.aggregative.ExplicitLossMinimisation(svmperf_base='./svm_perf_quantification', loss='q', verbose=0, C=1000)
# q = qp.method.aggregative.SVMQ(svmperf_base='./svm_perf_quantification', verbose=0, C=1000)
q = qp.method.aggregative.HDy(learner)
q.fit(dataset.training)
#model = qp.method.aggregative.HDy(learner)
#
model = qp.method.aggregative.HDy(learner)
model = OneVsAll(model)
print(model.get_params())
model.fit(dataset.training)
# estimating class prevalences
prevalences_estim = q.quantify(dataset.test.instances)
prevalences_estim = model.quantify(dataset.test.instances)
prevalences_true = dataset.test.prevalence()
# evaluation (one single prediction)
error = qp.error.mae(prevalences_true, prevalences_estim)
print(f'method {q.__class__.__name__}')
print(f'method {model.__class__.__name__}')
print(f'true prevalence {F.strprev(prevalences_true)}')
print(f'estim prevalence {F.strprev(prevalences_estim)}')
print(f'MAE={error:.3f}')