diff --git a/TweetSentQuant/experiments.py b/TweetSentQuant/experiments.py index cfbf97b..7b6f477 100644 --- a/TweetSentQuant/experiments.py +++ b/TweetSentQuant/experiments.py @@ -1,6 +1,6 @@ from sklearn.linear_model import LogisticRegression import quapy as qp -from quapy.method.aggregative import OneVsAll +from quapy.method.aggregative import CC, ACC, PCC, PACC, EMQ, OneVsAll, SVMQ, SVMKLD, SVMNKLD, SVMAE, SVMRAE, HDy import quapy.functional as F import numpy as np import os @@ -22,19 +22,26 @@ def quantification_models(): __C_range = np.logspace(-4, 5, 10) lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']} svmperf_params = {'C': __C_range} - yield 'cc', qp.method.aggregative.CC(newLR()), lr_params - yield 'acc', qp.method.aggregative.ACC(newLR()), lr_params - yield 'pcc', qp.method.aggregative.PCC(newLR()), lr_params - yield 'pacc', qp.method.aggregative.PACC(newLR()), lr_params - yield 'sld', qp.method.aggregative.EMQ(newLR()), lr_params - yield 'svmq', OneVsAll(qp.method.aggregative.SVMQ(args.svmperfpath)), svmperf_params - yield 'svmkld', OneVsAll(qp.method.aggregative.SVMKLD(args.svmperfpath)), svmperf_params - yield 'svmnkld', OneVsAll(qp.method.aggregative.SVMNKLD(args.svmperfpath)), svmperf_params - yield 'svmmae', OneVsAll(qp.method.aggregative.SVMAE(args.svmperfpath)), svmperf_params - yield 'svmmrae', OneVsAll(qp.method.aggregative.SVMRAE(args.svmperfpath)), svmperf_params - #sld = qp.method.aggregative.EMQ(newLR()) - #yield 'paccsld', qp.method.aggregative.PACC(sld), lr_params + # methods tested in Gao & Sebastiani 2016 + yield 'cc', CC(newLR()), lr_params + yield 'acc', ACC(newLR()), lr_params + yield 'pcc', PCC(newLR()), lr_params + yield 'pacc', PACC(newLR()), lr_params + yield 'sld', EMQ(newLR()), lr_params + yield 'svmq', OneVsAll(SVMQ(args.svmperfpath)), svmperf_params + yield 'svmkld', OneVsAll(SVMKLD(args.svmperfpath)), svmperf_params + yield 'svmnkld', OneVsAll(SVMNKLD(args.svmperfpath)), svmperf_params + + # methods added + 
yield 'svmmae', OneVsAll(SVMAE(args.svmperfpath)), svmperf_params + yield 'svmmrae', OneVsAll(SVMRAE(args.svmperfpath)), svmperf_params + yield 'hdy', OneVsAll(HDy(newLR())), lr_params + + # to add: + # quapy + # ensembles + # # 'mlpe': lambda learner: MaximumLikelihoodPrevalenceEstimation(), diff --git a/quapy/__init__.py b/quapy/__init__.py index 20dc49b..5b98d57 100644 --- a/quapy/__init__.py +++ b/quapy/__init__.py @@ -7,7 +7,7 @@ from . import evaluation from . import plot from . import util from . import model_selection -from quapy.method.aggregative import isaggregative, isprobabilistic +from quapy.method.base import isprobabilistic, isaggregative environ = { @@ -21,3 +21,5 @@ environ = { def isbinary(x): return x.binary + + diff --git a/quapy/evaluation.py b/quapy/evaluation.py index 78917d1..293b709 100644 --- a/quapy/evaluation.py +++ b/quapy/evaluation.py @@ -8,6 +8,7 @@ import quapy as qp from quapy.data import LabelledCollection from quapy.method.base import BaseQuantifier from quapy.util import temp_seed +import quapy.functional as F def artificial_sampling_prediction( @@ -39,18 +40,18 @@ def artificial_sampling_prediction( with temp_seed(random_seed): indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions)) - if isinstance(model, qp.method.aggregative.AggregativeQuantifier): - # print('\tinstance of aggregative-quantifier') + if model.aggregative: #isinstance(model, qp.method.aggregative.AggregativeQuantifier): + print('\tinstance of aggregative-quantifier') quantification_func = model.aggregate - if isinstance(model, qp.method.aggregative.AggregativeProbabilisticQuantifier): - # print('\t\tinstance of probabilitstic-aggregative-quantifier') + if model.probabilistic: # isinstance(model, qp.method.aggregative.AggregativeProbabilisticQuantifier): + print('\t\tinstance of probabilitstic-aggregative-quantifier') preclassified_instances = model.posterior_probabilities(test.instances) else: - # print('\t\tinstance 
of hard-aggregative-quantifier') + print('\t\tinstance of hard-aggregative-quantifier') preclassified_instances = model.classify(test.instances) test = LabelledCollection(preclassified_instances, test.labels) else: - # print('\t\tinstance of base-quantifier') + print('\t\tinstance of base-quantifier') quantification_func = model.quantify def _predict_prevalences(index): diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 4e09945..eb1661e 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -1,7 +1,6 @@ from abc import abstractmethod from copy import deepcopy from typing import Union - import numpy as np from joblib import Parallel, delayed from sklearn.base import BaseEstimator @@ -60,6 +59,10 @@ class AggregativeQuantifier(BaseQuantifier): def classes(self): return self.learner.classes_ + @property + def aggregative(self): + return True + class AggregativeProbabilisticQuantifier(AggregativeQuantifier): """ @@ -84,6 +87,9 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier): parameters={'base_estimator__'+k:v for k,v in parameters.items()} self.learner.set_params(**parameters) + @property + def probabilistic(self): + return True # Helper @@ -385,6 +391,10 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): Px = self.posterior_probabilities(validation.instances)[:,1] # takes only the P(y=+1|x) self.Pxy1 = Px[validation.labels == 1] self.Pxy0 = Px[validation.labels == 0] + # pre-compute the histogram for positive and negative examples + self.bins = np.linspace(10, 110, 11, dtype=int) #[10, 20, 30, ..., 100, 110] + self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins} + self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins} return self def aggregate(self, classif_posteriors): @@ -395,9 +405,12 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): Px = 
classif_posteriors[:,1] # takes only the P(y=+1|x) prev_estimations = [] - for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110] - Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True) - Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True) + #for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110] + #Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True) + #Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True) + for bins in self.bins: + Pxy0_density = self.Pxy0_density[bins] + Pxy1_density = self.Pxy1_density[bins] Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True) @@ -488,9 +501,7 @@ class OneVsAll(AggregativeQuantifier): assert isinstance(self.binary_quantifier, BaseQuantifier), \ f'{self.binary_quantifier} does not seem to be a Quantifier' assert fit_learner==True, 'fit_learner must be True' - if not isinstance(self.binary_quantifier, BinaryQuantifier): - raise ValueError(f'{self.binary_quantifier.__class__.__name__} does not seem to be an instance of ' - f'{BinaryQuantifier.__class__.__name__}') + self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_} self.__parallel(self._delayed_binary_fit, data) return self @@ -502,20 +513,39 @@ class OneVsAll(AggregativeQuantifier): classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances) return classif_predictions_bin.T + def posterior_probabilities(self, instances): + # returns a matrix of shape (n,m,2) with n the number of instances and m the number of classes. The entry + # (i,j,1) (resp. (i,j,0)) is a value in [0,1] indicating the posterior probability that instance i belongs + # (resp. does not belong) to class j. + # The posterior probabilities are independent of each other, meaning that, in general, they do not sum + # up to one. 
+ if not self.binary_quantifier.probabilistic: + raise NotImplementedError(f'{self.__class__.__name__} does not implement posterior_probabilities because ' + f'the base quantifier {self.binary_quantifier.__class__.__name__} is not ' + f'probabilistic') + posterior_predictions_bin = self.__parallel(self._delayed_binary_posteriors, instances) + return np.swapaxes(posterior_predictions_bin, 0, 1) + def aggregate(self, classif_predictions_bin): - assert set(np.unique(classif_predictions_bin)).issubset({0,1}), \ - 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \ - 'predictions for each document (row) and class (columns)' + if self.probabilistic: + assert classif_predictions_bin.shape[1]==self.n_classes and classif_predictions_bin.shape[2]==2, \ + 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \ + 'probabilities (2 dimensions) for each document (row) and class (columns)' + else: + assert set(np.unique(classif_predictions_bin)).issubset({0,1}), \ + 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \ + 'predictions for each document (row) and class (columns)' prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin) - #prevalences = [] - #for c in self.classes: - # prevalences.append(self._delayed_binary_aggregate(c, classif_predictions_bin)) - #prevalences = np.asarray(prevalences) return F.normalize_prevalence(prevalences) def quantify(self, X): - prevalences = self.__parallel(self._delayed_binary_quantify, X) - return F.normalize_prevalence(prevalences) + if self.probabilistic: + predictions = self.posterior_probabilities(X) + else: + predictions = self.classify(X) + return self.aggregate(predictions) + #prevalences = self.__parallel(self._delayed_binary_quantify, X) + #return F.normalize_prevalence(prevalences) def __parallel(self, func, *args, **kwargs): return np.asarray( @@ -537,9 +567,12 @@ class 
OneVsAll(AggregativeQuantifier): def _delayed_binary_classification(self, c, X): return self.dict_binary_quantifiers[c].classify(X) - def _delayed_binary_quantify(self, c, X): + def _delayed_binary_posteriors(self, c, X): + return self.dict_binary_quantifiers[c].posterior_probabilities(X) + + #def _delayed_binary_quantify(self, c, X): # the estimation for the positive class prevalence - return self.dict_binary_quantifiers[c].quantify(X)[1] + # return self.dict_binary_quantifiers[c].quantify(X)[1] def _delayed_binary_aggregate(self, c, classif_predictions): # the estimation for the positive class prevalence @@ -549,13 +582,14 @@ class OneVsAll(AggregativeQuantifier): bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2) self.dict_binary_quantifiers[c].fit(bindata) + @property + def binary(self): + return False + + @property + def probabilistic(self): + return self.binary_quantifier.probabilistic -def isaggregative(model:BaseQuantifier): - return isinstance(model, AggregativeQuantifier) - - -def isprobabilistic(model:BaseQuantifier): - return isinstance(model, AggregativeProbabilisticQuantifier) diff --git a/quapy/method/base.py b/quapy/method/base.py index 9dfd74c..de53ad9 100644 --- a/quapy/method/base.py +++ b/quapy/method/base.py @@ -5,12 +5,10 @@ from quapy.data import LabelledCollection # Base Quantifier abstract class # ------------------------------------ - - class BaseQuantifier(metaclass=ABCMeta): @abstractmethod - def fit(self, data): ... + def fit(self, data: LabelledCollection): ... @abstractmethod def quantify(self, instances): ... @@ -21,10 +19,20 @@ class BaseQuantifier(metaclass=ABCMeta): @abstractmethod def get_params(self, deep=True): ... 
+ # these properties allow meta-learners to reimplement the decision based on their constituents, and not + # based on class structure @property def binary(self): return False + @property + def aggregative(self): + return False + + @property + def probabilistic(self): + return False + class BinaryQuantifier(BaseQuantifier): def _check_binary(self, data: LabelledCollection, quantifier_name): @@ -40,7 +48,15 @@ def isbinary(model:BaseQuantifier): return model.binary -# class OneVsAll(AggregativeQuantifier): +def isaggregative(model:BaseQuantifier): + return model.aggregative + + +def isprobabilistic(model:BaseQuantifier): + return model.probabilistic + + +# class OneVsAll: # """ # Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary # quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1. diff --git a/quapy/method/meta.py b/quapy/method/meta.py index cc2a473..e6a3de1 100644 --- a/quapy/method/meta.py +++ b/quapy/method/meta.py @@ -152,6 +152,19 @@ class Ensemble(BaseQuantifier): order = np.argsort(dist) return select_k(predictions, order, k=self.red_size) + @property + def binary(self): + return self.base_quantifier.binary + + @property + def aggregative(self): + raise NotImplementedError('aggregative functionality not yet supported for Ensemble') + + @property + def probabilistic(self): + raise NotImplementedError('probabilistic functionality not yet supported for Ensemble') + #return self.base_quantifier.probabilistic + def get_probability_distribution(posterior_probabilities, bins=8): assert posterior_probabilities.shape[1]==2, 'the posterior probabilities do not seem to be for a binary problem' diff --git a/quapy/model_selection.py b/quapy/model_selection.py index c3a2556..06fb293 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -157,7 +157,7 @@ class GridSearchQ(BaseQuantifier): model.fit(training) true_prevalences, estim_prevalences 
= artificial_sampling_prediction( model, validation, self.sample_size, self.n_prevpoints, self.n_repetitions, n_jobs, self.random_seed, - verbose=False + verbose=True ) score = self.error(true_prevalences, estim_prevalences)