diff --git a/README.md b/README.md
index 668576f..aed5f1b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,3 @@
 # QuaPy
-A Python framework for Quantification
\ No newline at end of file
+A Quantification framework written in Python.
\ No newline at end of file
diff --git a/TODO.txt b/TODO.txt
index d25ed25..02882af 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,3 +1,8 @@
 Documentation with sphinx
-The parallel training in svmperf seems not to work
-Add "prepare svmperf for quantification" script
\ No newline at end of file
+Add evaluation - artificial sampling
+Add quantification_report (akin to classification_report from sklearn)
+Add optimization - artificial sampling
+Add prediction - artificial sampling
+Add readers for typical datasets used in Quantification
+Add NAE, NRAE
+Add "measures for evaluating ordinal"?
diff --git a/quapy/__init__.py b/quapy/__init__.py
index 59e21fe..19dc14e 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -1,6 +1,5 @@
-from .dataset import *
+from .data import *
 from . import functional
 from . import method
 from . import error
-
-
+from . import evaluation
diff --git a/quapy/classification/svmperf.py b/quapy/classification/svmperf.py
index eb788c4..ceab225 100644
--- a/quapy/classification/svmperf.py
+++ b/quapy/classification/svmperf.py
@@ -20,12 +20,9 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         self.verbose = verbose
         self.loss = loss

-    def set_c(self, C):
-        self.param_C = '-c ' + str(C)
-
     def set_params(self, **parameters):
         assert list(parameters.keys()) == ['C'], 'currently, only the C parameter is supported'
-        self.set_c(parameters['C'])
+        self.C = parameters['C']

     def fit(self, X, y):
         assert self.loss in SVMperf.valid_losses, \
@@ -33,8 +30,8 @@ class SVMperf(BaseEstimator, ClassifierMixin):

         self.svmperf_learn = join(self.svmperf_base, 'svm_perf_learn')
         self.svmperf_classify = join(self.svmperf_base, 'svm_perf_classify')
-        self.loss_cmd = '-l ' + str(self.valid_losses[self.loss])
-        self.set_c(self.C)
+        self.loss_cmd = '-w 3 -l ' + str(self.valid_losses[self.loss])
+        self.c_cmd = '-c ' + str(self.C)

         self.classes_ = sorted(np.unique(y))
         self.n_classes_ = len(self.classes_)
@@ -49,7 +46,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):

         dump_svmlight_file(X, y, traindat, zero_based=False)

-        cmd = ' '.join([self.svmperf_learn, self.param_C, self.loss_cmd, traindat, self.model])
+        cmd = ' '.join([self.svmperf_learn, self.c_cmd, self.loss_cmd, traindat, self.model])
         if self.verbose:
             print('[Running]', cmd)
         p = subprocess.run(cmd.split(), stdout=PIPE, stderr=STDOUT)
@@ -60,7 +57,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):

         return self

-    def predict(self, X, y=None):
+    def predict(self, X):
         confidence_scores = self.decision_function(X)
         predictions = (confidence_scores > 0) * 1
         return predictions
diff --git a/quapy/dataset/__init__.py b/quapy/data/__init__.py
similarity index 100%
rename from quapy/dataset/__init__.py
rename to quapy/data/__init__.py
diff --git a/quapy/dataset/base.py b/quapy/data/base.py
similarity index 91%
rename from quapy/dataset/base.py
rename to quapy/data/base.py
index 29a188f..ce7b6d9 100644
--- a/quapy/dataset/base.py
+++ b/quapy/data/base.py
@@ -22,12 +22,6 @@ class LabelledCollection:
     def load(cls, path:str, loader_func:callable):
         return LabelledCollection(*loader_func(path))

-    @classmethod
-    def load_dataset(cls, train_path, test_path):
-        training = cls.load(train_path)
-        test = cls.load(test_path)
-        return Dataset(training, test)
-
     def __len__(self):
         return self.instances.shape[0]

@@ -43,13 +37,13 @@ class LabelledCollection:

     @property
     def binary(self):
-        return self.n_classes==2
+        return self.n_classes == 2

     def sampling_index(self, size, *prevs, shuffle=True):
         if len(prevs) == self.n_classes-1:
             prevs = prevs + (1-sum(prevs),)
         assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
-        assert sum(prevs) == 1, f'prevalences ({prevs}) out of range (sum={sum(prevs)})'
+        assert sum(prevs) == 1, f'prevalences ({prevs}) do not sum up to 1 (sum={sum(prevs)})'

         taken = 0
         indexes_sample = []
@@ -93,6 +87,11 @@ class LabelledCollection:
         for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
             yield self.sampling(sample_size, *prevs)

+    def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
+        dimensions = self.n_classes
+        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
+            yield self.sampling_index(sample_size, *prevs)
+
     def __add__(self, other):
         if issparse(self.instances) and issparse(other.documents):
             docs = vstack([self.instances, other.documents])
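The new index generator pairs with LabelledCollection.sampling_from_index (relied upon by quapy/evaluation.py below): generating indexes once and materializing samples on demand avoids copying the data for every prevalence point. A minimal sketch, assuming `test` is a LabelledCollection:

# draw 11 equidistant prevalences, one sample of 500 instances each
for index in test.artificial_sampling_index_generator(500, n_prevalences=11, repeats=1):
    sample = test.sampling_from_index(index)
    print(sample.prevalence())  # the true prevalence of the drawn sample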
diff --git a/quapy/dataset/preprocessing.py b/quapy/data/preprocessing.py
similarity index 97%
rename from quapy/dataset/preprocessing.py
rename to quapy/data/preprocessing.py
index a6259b2..b08bcab 100644
--- a/quapy/dataset/preprocessing.py
+++ b/quapy/data/preprocessing.py
@@ -1,9 +1,10 @@
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from dataset.base import Dataset
+from data.base import Dataset
 from scipy.sparse import spmatrix
 from utils.util import parallelize
 from .base import LabelledCollection
+from tqdm import tqdm


 def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
@@ -78,8 +79,8 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
    :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True) consisting of
    lists of integer values representing indices.
    """
-    __check_type(dataset.training.instances, list, str)
-    __check_type(dataset.test.instances, list, str)
+    __check_type(dataset.training.instances, np.ndarray, str)
+    __check_type(dataset.test.instances, np.ndarray, str)

     indexer = IndexTransformer(min_df=min_df, **kwargs)
     training_index = indexer.fit_transform(dataset.training.instances)
@@ -105,7 +106,6 @@ def __check_type(container, container_type=None, element_type=None):
         f'unexpected type of element (expected {container_type}, found {type(container)})'


-
 class IndexTransformer:

     def __init__(self, **kwargs):
@@ -140,7 +140,7 @@ class IndexTransformer:
         return self.fit(X).transform(X, n_jobs=n_jobs)

     def vocabulary_size(self):
-        return len(self.vocabulary_) + 1  # the reserved unk token
+        return len(self.vocabulary_)

     def add_word(self, word):
         if word in self.vocabulary_:
diff --git a/quapy/dataset/reader.py b/quapy/data/reader.py
similarity index 100%
rename from quapy/dataset/reader.py
rename to quapy/data/reader.py
diff --git a/quapy/error.py b/quapy/error.py
index ff9a6e0..f52540f 100644
--- a/quapy/error.py
+++ b/quapy/error.py
@@ -1,5 +1,8 @@
 from sklearn.metrics import f1_score
-from settings import SAMPLE_SIZE
+import numpy as np
+
+
+SAMPLE_SIZE = None


 def f1e(y_true, y_pred):
@@ -7,8 +10,7 @@ def f1e(y_true, y_pred):


 def acce(y_true, y_pred):
-    acc = (y_true == y_pred).mean()
-    return 1. - acc
+    return 1. - (y_true == y_pred).mean()


 def mae(prevs, prevs_hat):
@@ -20,11 +22,40 @@ def ae(p, p_hat):
     return abs(p_hat-p).mean(axis=-1)


-def mrae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)):
+def mse(prevs, prevs_hat):
+    return se(prevs, prevs_hat).mean()
+
+
+def se(p, p_hat):
+    return ((p_hat-p)**2).mean(axis=-1)
+
+
+def mkld(prevs, prevs_hat):
+    return kld(prevs, prevs_hat).mean()
+
+
+def kld(p, p_hat, eps=None):
+    eps = __check_eps(eps)
+    sp = p+eps
+    sp_hat = p_hat + eps
+    return (sp*np.log(sp/sp_hat)).sum(axis=-1)
+
+
+def mnkld(prevs, prevs_hat):
+    return nkld(prevs, prevs_hat).mean()
+
+
+def nkld(p, p_hat, eps=None):
+    ekld = np.exp(kld(p, p_hat, eps))
+    return 2. * ekld / (1 + ekld) - 1.
+
+
+def mrae(p, p_hat, eps=None):
     return rae(p, p_hat, eps).mean()


-def rae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)):
+def rae(p, p_hat, eps=None):
+    eps = __check_eps(eps)
     p = smooth(p, eps)
     p_hat = smooth(p_hat, eps)
     return (abs(p-p_hat)/p).mean(axis=-1)
@@ -35,8 +66,17 @@ def smooth(p, eps):
     return (p+eps)/(eps*n_classes + 1)


+def __check_eps(eps):
+    if eps is None:
+        if SAMPLE_SIZE is None:
+            raise ValueError('eps was not defined, and qp.error.SAMPLE_SIZE was not set')
+        else:
+            eps = 1. / (2. * SAMPLE_SIZE)
+    return eps
+
+
 CLASSIFICATION_ERROR = {f1e, acce}
-QUANTIFICATION_ERROR = {mae, mrae}
+QUANTIFICATION_ERROR = {mae, mrae, mse, mkld, mnkld}

 f1_error = f1e
 acc_error = acce
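Since eps now defaults to None, the smoothed errors (rae, kld and their aggregated variants) require either an explicit eps or a previously set qp.error.SAMPLE_SIZE, as test.py at the end of this patch does. A minimal sketch of the new API:

import numpy as np
import quapy as qp

qp.error.SAMPLE_SIZE = 100           # eps defaults to 1/(2*SAMPLE_SIZE)
p_true = np.asarray([[0.2, 0.8]])    # one sample, two classes
p_hat = np.asarray([[0.3, 0.7]])

print(qp.error.mae(p_true, p_hat))   # 0.1
print(qp.error.mrae(p_true, p_hat))  # smoothed with the eps above
print(qp.error.mkld(p_true, p_hat))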
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
new file mode 100644
index 0000000..92c77c2
--- /dev/null
+++ b/quapy/evaluation.py
@@ -0,0 +1,53 @@
+from data import LabelledCollection
+from method.base import BaseQuantifier
+from utils.util import temp_seed
+import numpy as np
+from joblib import Parallel, delayed
+from tqdm import tqdm
+
+
+def artificial_sampling_prediction(
+        model: BaseQuantifier,
+        test: LabelledCollection,
+        sample_size,
+        prevalence_points=21,
+        point_repetitions=1,
+        n_jobs=-1,
+        random_seed=42):
+    """
+    Performs the predictions for all samples generated according to the artificial sampling protocol.
+    :param model: the model in charge of generating the class prevalence estimations
+    :param test: the test set on which to perform artificial sampling
+    :param sample_size: the size of the samples
+    :param prevalence_points: the number of different prevalences to sample
+    :param point_repetitions: the number of repetitions for each prevalence
+    :param n_jobs: number of jobs to be run in parallel
+    :param random_seed: allows replicating the samplings. The seed is local to the method and does not affect
+    any other random process.
+    :return: two ndarrays of shape [m,n], with m the number of samples (prevalence_points*point_repetitions) and
+    n the number of classes; the first one contains the true prevalences of the generated samples, and the second
+    one the estimated prevalences
+    """
+
+    with temp_seed(random_seed):
+        indexes = list(test.artificial_sampling_index_generator(sample_size, prevalence_points, point_repetitions))
+
+    def _predict_prevalences(index):
+        sample = test.sampling_from_index(index)
+        true_prevalence = sample.prevalence()
+        estim_prevalence = model.quantify(sample.instances)
+        return true_prevalence, estim_prevalence
+
+    results = Parallel(n_jobs=n_jobs)(
+        delayed(_predict_prevalences)(index) for index in tqdm(indexes)
+    )
+
+    true_prevalences, estim_prevalences = zip(*results)
+    true_prevalences = np.asarray(true_prevalences)
+    estim_prevalences = np.asarray(estim_prevalences)
+
+    return true_prevalences, estim_prevalences
+
+
+
The first one contains the true prevalences for the samples generated while the second one + containing the the prevalences estimations + """ + + with temp_seed(random_seed): + indexes = list(test.artificial_sampling_index_generator(sample_size, prevalence_points, point_repetitions)) + + def _predict_prevalences(index): + sample = test.sampling_from_index(index) + true_prevalence = sample.prevalence() + estim_prevalence = model.quantify(sample.instances) + return true_prevalence, estim_prevalence + + results = Parallel(n_jobs=n_jobs)( + delayed(_predict_prevalences)(index) for index in tqdm(indexes) + ) + + true_prevalences, estim_prevalences = zip(*results) + true_prevalences = np.asarray(true_prevalences) + estim_prevalences = np.asarray(estim_prevalences) + + return true_prevalences, estim_prevalences + + + + diff --git a/quapy/functional.py b/quapy/functional.py index f44a85b..48952b2 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -15,6 +15,26 @@ def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, retur return prevs +def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01): + """ + Produces a uniformly separated values of prevalence. By default, produces an array 21 prevalences, with step 0.05 + and with the limits smoothed, i.e.: + [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99] + :param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21) + :param repeat: number of times each prevalence is to be repeated (defaults to 1) + :param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1 + :return: an array of uniformly separated prevalence values + """ + p = np.linspace(0., 1., num=n_prevalences, endpoint=True) + p[0] += smooth_limits_epsilon + p[-1] -= smooth_limits_epsilon + if p[0] > p[1]: + raise ValueError(f'the smoothing in the limits is greater than the prevalence step') + if repeat > 1: + p = np.repeat(p, repeat) + return p + + def prevalence_from_labels(labels, n_classes): unique, counts = np.unique(labels, return_counts=True) by_class = defaultdict(lambda:0, dict(zip(unique, counts))) @@ -47,3 +67,54 @@ def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True): return adjusted +def normalize_prevalence(prevalences): + assert prevalences.ndim==1, 'unexpected shape' + accum = prevalences.sum() + if accum > 0: + return prevalences / accum + else: + # if all classifiers are trivial rejectors + return np.ones_like(prevalences) / prevalences.size + + +def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int): + """ + Computes the number of prevalence combinations in the nclasses-dimensional simplex if nprevpoints equally distant + prevalences are generated and nrepeats repetitions are requested + :param nclasses: number of classes + :param nprevpoints: number of prevalence points. + :param nrepeats: number of repetitions for each prevalence combination + :return: The number of possible combinations. 
diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py
index a8e98d0..88acd16 100644
--- a/quapy/method/__init__.py
+++ b/quapy/method/__init__.py
@@ -1,5 +1,6 @@
+from . import base
 from . import aggregative as agg
-from . import non_aggregative as nagg
+from . import non_aggregative


 AGGREGATIVE_METHODS = {
@@ -9,22 +10,14 @@ AGGREGATIVE_METHODS = {
     agg.ProbabilisticAdjustedClassifyAndCount,
     agg.ExplicitLossMinimisation,
     agg.ExpectationMaximizationQuantifier,
+    agg.HellingerDistanceY
 }


 NON_AGGREGATIVE_METHODS = {
-    nagg.MaximumLikelihoodPrevalenceEstimation
+    non_aggregative.MaximumLikelihoodPrevalenceEstimation
 }


 QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS


-# common alisases
-CC = agg.ClassifyAndCount
-ACC = agg.AdjustedClassifyAndCount
-PCC = agg.ProbabilisticClassifyAndCount
-PACC = agg.ProbabilisticAdjustedClassifyAndCount
-ELM = agg.ExplicitLossMinimisation
-EMQ = agg.ExpectationMaximizationQuantifier
-MLPE = nagg.MaximumLikelihoodPrevalenceEstimation
-
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index ee16baf..1423fe6 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -1,12 +1,14 @@
 import numpy as np
-from .base import *
-from ..error import mae
+from copy import deepcopy
 import functional as F
-from ..classification.svmperf import SVMperf
-from ..dataset import LabelledCollection
+import error
+from method.base import BaseQuantifier
+from quapy.classification.svmperf import SVMperf
+from quapy.data import LabelledCollection
 from sklearn.metrics import confusion_matrix
 from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
+from abc import abstractmethod


 # Abstract classes
@@ -21,8 +23,16 @@ class AggregativeQuantifier(BaseQuantifier):
     @abstractmethod
     def fit(self, data: LabelledCollection, fit_learner=True, *args): ...

-    def classify(self, documents):
-        return self.learner.predict(documents)
+    @property
+    def learner(self):
+        return self.learner_
+
+    @learner.setter
+    def learner(self, value):
+        self.learner_ = value
+
+    def classify(self, instances):
+        return self.learner.predict(instances)

     def get_params(self, deep=True):
         return self.learner.get_params()
@@ -67,12 +77,12 @@ def training_helper(learner,
     Training procedure common to all Aggregative Quantifiers.
    :param learner: the learner to be fit
    :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
-    :param fit_learner: whether or not to fit the learner
+    :param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
    :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
    learner is not probabilistic, then a CalibratedCV instance of it is trained)
-    :param train_val_split: if specified, indicates the proportion of training documents on which to fit the learner
+    :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
    :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
-    or None otherwise)
+    or None otherwise) to be used as a validation set for any subsequent parameter fitting
     """
     if fit_learner:
         if ensure_probabilistic:
@@ -118,8 +128,8 @@ class ClassifyAndCount(AggregativeQuantifier):
         self.learner, _ = training_helper(self.learner, data, fit_learner)
         return self

-    def quantify(self, documents, *args):
-        classification = self.classify(documents)                       # classify
+    def quantify(self, instances, *args):
+        classification = self.classify(instances)                       # classify
         return F.prevalence_from_labels(classification, self.n_classes) # & count


@@ -138,8 +148,8 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
         self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
         return self

-    def quantify(self, documents, *args):
-        prevs_estim = self.cc.quantify(documents)
+    def quantify(self, instances, *args):
+        prevs_estim = self.cc.quantify(instances)
         # solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim
         A = self.Pte_cond_estim_
         B = prevs_estim
@@ -163,8 +173,8 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
         self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
         return self

-    def quantify(self, documents, *args):
-        posteriors = self.soft_classify(documents)                                 # classify
+    def quantify(self, instances, *args):
+        posteriors = self.soft_classify(instances)                                 # classify
         prevalences = F.prevalence_from_probabilities(posteriors, binarize=False)  # & count
         return prevalences

@@ -186,8 +196,8 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
         self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
         return self

-    def quantify(self, documents, *args):
-        prevs_estim = self.pcc.quantify(documents)
+    def quantify(self, instances, *args):
+        prevs_estim = self.pcc.quantify(instances)
         A = self.Pte_cond_estim_
         B = prevs_estim
         try:
@@ -237,7 +247,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
             # M-step: qs_pos is Ps+1(y=+1)
             qs = ps.mean(axis=0)

-            if qs_prev_ is not None and mae(qs, qs_prev_) < epsilon and s>10:
+            if qs_prev_ is not None and error.mae(qs, qs_prev_) < epsilon and s>10:
                 converged = True

             qs_prev_ = qs
@@ -252,79 +262,147 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):

         return qs


-# todo: from here
-def train_task(c, learners, data):
-    learners[c].fit(data.documents, data.labels == c)
+class HellingerDistanceY(AggregativeProbabilisticQuantifier):
+    """
+    Implementation of the method based on the Hellinger Distance y (HDy) proposed by
+    González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution
+    estimation based on the Hellinger distance. Information Sciences, 218:146–164.
+    """
+
+    def __init__(self, learner):
+        self.learner = learner
+
+    def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
+        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
+                            f'Use the class OneVsAll to enable {self.__class__.__name__} to work on single-label data.'
+        self.learner, validation = training_helper(
+            self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
+        Px = self.soft_classify(validation.instances)
+        self.Pxy1 = Px[validation.labels == 1]
+        self.Pxy0 = Px[validation.labels == 0]
+        return self
+
+    def quantify(self, instances, *args):
+        # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
+        # and the final estimated a priori probability was taken as the median of these 11 estimates."
+        # (González-Castro, et al., 2013).
+
+        Px = self.soft_classify(instances)
+
+        prev_estimations = []
+        for bins in np.linspace(10, 110, 11, dtype=int):  # [10, 20, 30, ..., 100, 110]
+            Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
+            Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
+
+            Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True)
+
+            prev_selected, min_dist = None, None
+            for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0):
+                Px_train = prev*Pxy1_density + (1 - prev)*Pxy0_density
+                hdy = HellingerDistanceY.HellingerDistance(Px_train, Px_test)
+                if prev_selected is None or hdy < min_dist:
+                    prev_selected, min_dist = prev, hdy
+            prev_estimations.append(prev_selected)
+
+        pos_class_prev = np.median(prev_estimations)
+        return np.asarray([1-pos_class_prev, pos_class_prev])
+
+    @classmethod
+    def HellingerDistance(cls, P, Q):
+        return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))


-def binary_quant_task(c, learners, X):
-    predictions_ci = learners[c].predict(X)
-    return predictions_ci.mean()  # since the predictions array is binary
+class OneVsAll(AggregativeQuantifier):
+    """
+    Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
+    quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
+    """
+
+    def __init__(self, binary_method, n_jobs=-1):
+        self.binary_method = binary_method
+        self.n_jobs = n_jobs
+
+    def fit(self, data: LabelledCollection, **kwargs):
+        assert not data.binary, f'{self.__class__.__name__} expects non-binary data'
+        assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
+        self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
+        Parallel(n_jobs=self.n_jobs, backend='threading')(
+            delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
+        )
+        return self
+
+    def quantify(self, X, *args):
+        prevalences = np.asarray(
+            Parallel(n_jobs=self.n_jobs, backend='threading')(
+                delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
+            )
+        )
+        return F.normalize_prevalence(prevalences)
+
+    @property
+    def classes(self):
+        return sorted(self.class_method.keys())
+
+    def set_params(self, **parameters):
+        self.binary_method.set_params(**parameters)
+
+    def get_params(self, deep=True):
+        return self.binary_method.get_params()
+
+    def _delayed_binary_predict(self, c, learners, X):
+        return learners[c].classify(X).mean()  # the mean is the estimation for the positive class prevalence
+
+    def _delayed_binary_fit(self, c, learners, data, **kwargs):
+        bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
+        learners[c].fit(bindata, **kwargs)


-class OneVsAllELM(AggregativeQuantifier):
+class ExplicitLossMinimisation(AggregativeQuantifier):
+    """
+    A variant of Explicit Loss Minimisation based on SVMperf that also works on single-label data. It uses one binary
+    quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
+    This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment
+    analysis. Social Network Analysis and Mining 6(19), 1–22 (2016)
+    """

-    def __init__(self, svmperf_base, loss, n_jobs=-1, **kwargs):
+    def __init__(self, svmperf_base, loss, **kwargs):
         self.svmperf_base = svmperf_base
         self.loss = loss
-        self.n_jobs = n_jobs
         self.kwargs = kwargs

     def fit(self, data: LabelledCollection, fit_learner=True, *args):
         assert fit_learner, 'the method requires that fit_learner=True'
+        self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
+        if not data.binary:
+            self.learner = OneVsAll(self.learner, n_jobs=-1)
+        return self.learner.fit(data, *args)

-        self.learners = {c: SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs) for c in data.classes_}
-        Parallel(n_jobs=self.n_jobs, backend='threading')(
-            delayed(train_task)(c, self.learners, data) for c in self.learners.keys()
-        )
-        return self
-
-    def quantify(self, X, y=None):
-        prevalences = np.asarray(
-            Parallel(n_jobs=self.n_jobs, backend='threading')(
-                delayed(binary_quant_task)(c, self.learners, X) for c in self.learners.keys()
-            )
-        )
-        prevalences /= prevalences.sum()
-        return prevalences
-
-    @property
-    def classes(self):
-        return sorted(self.learners.keys())
-
-    def preclassify_collection(self, data: LabelledCollection):
-        classifications = []
-        for class_ in data.classes_:
-            classifications.append(self.learners[class_].predict(data.instances))
-        classifications = np.vstack(classifications).T
-        precomputed = LabelledCollection(classifications, data.labels)
-        return precomputed
-
-    def set_params(self, **parameters):
-        self.kwargs=parameters
-
-    def get_params(self, deep=True):
-        return self.kwargs
+    def quantify(self, instances, *args):
+        return self.learner.quantify(instances, *args)


-class ExplicitLossMinimisation(AggregativeQuantifier):
+class ExplicitLossMinimisationBinary(AggregativeQuantifier):

     def __init__(self, svmperf_base, loss, **kwargs):
-        self.learner = SVMperf(svmperf_base, loss=loss, **kwargs)
+        self.svmperf_base = svmperf_base
+        self.loss = loss
+        self.kwargs = kwargs

     def fit(self, data: LabelledCollection, fit_learner=True, *args):
+        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
         assert fit_learner, 'the method requires that fit_learner=True'
-        self.learner.fit(data.instances, data.labels)
+        self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
         return self

     def quantify(self, X, y=None):
         predictions = self.learner.predict(X)
         return F.prevalence_from_labels(predictions, self.learner.n_classes_)

     def classify(self, X, y=None):
         return self.learner.predict(X)

+
 class SVMQ(ExplicitLossMinimisation):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
@@ -349,3 +429,12 @@ class SVMRAE(ExplicitLossMinimisation):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)

+
+CC = ClassifyAndCount
+ACC = AdjustedClassifyAndCount
+PCC = ProbabilisticClassifyAndCount
+PACC = ProbabilisticAdjustedClassifyAndCount
+ELM = ExplicitLossMinimisation
+EMQ = ExpectationMaximizationQuantifier
+HDy = HellingerDistanceY
+
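A usage sketch for the new HDy quantifier (assuming a binary dataset loaded as in test.py at the end of this patch):

from sklearn.linear_model import LogisticRegression
import quapy as qp

model = qp.method.aggregative.HellingerDistanceY(LogisticRegression())
model.fit(dataset.training)  # holds out 40% of training as validation (train_val_split=0.6)
prevalence = model.quantify(dataset.test.instances)  # ndarray [prev(y=0), prev(y=1)]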
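The contract is deliberately small; a toy subclass sketch (the class name is ours, and this is essentially what MaximumLikelihoodPrevalenceEstimation amounts to):

class TrainingPrevalenceQuantifier(BaseQuantifier):
    # always predicts the prevalence observed in the training set

    def fit(self, data, *args):
        self.prev_ = data.prevalence()
        return self

    def quantify(self, instances, *args):
        return self.prev_

    def set_params(self, **parameters):
        pass

    def get_params(self, deep=True):
        return {}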
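temp_seed restores the global NumPy RNG state on exit, which is what makes the random_seed of artificial_sampling_prediction local to that call; a minimal sketch (import path as used in quapy/evaluation.py):

import numpy as np
from utils.util import temp_seed

with temp_seed(42):
    a = np.random.rand(3)  # reproducible across runs
b = np.random.rand(3)      # unaffected: the outer RNG state was restored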
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..679e069
--- /dev/null
+++ b/test.py
@@ -0,0 +1,52 @@
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import LinearSVC
+import quapy as qp
+import quapy.functional as F
+
+SAMPLE_SIZE = 500
+binary = False
+
+if binary:
+    # load a textual binary dataset and create a tfidf bag of words
+    train_path = './datasets/reviews/kindle/train.txt'
+    test_path = './datasets/reviews/kindle/test.txt'
+    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
+    qp.preprocessing.text2tfidf(dataset, inplace=True)
+    qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
+
+else:
+    # load a sparse matrix ternary dataset
+    train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
+    test_path = './datasets/twitter/test/sst.test.feature.txt'
+    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
+
+# training a quantifier
+learner = LogisticRegression()
+model = qp.method.aggregative.ClassifyAndCount(learner)
+# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
+# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
+# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
+# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
+model.fit(dataset.training)
+
+# estimating class prevalences
+prevalences_estim = model.quantify(dataset.test.instances)
+prevalences_true = dataset.test.prevalence()
+
+# evaluation (one single prediction)
+error = qp.error.mae(prevalences_true, prevalences_estim)
+
+print(f'method {model.__class__.__name__}')
+
+print(f'Evaluation in test (1 eval)')
+print(f'true prevalence {F.strprev(prevalences_true)}')
+print(f'estim prevalence {F.strprev(prevalences_estim)}')
+print(f'mae={error:.3f}')
+
+true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE)
+
+qp.error.SAMPLE_SIZE = SAMPLE_SIZE
+print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
+for error in qp.error.QUANTIFICATION_ERROR:
+    score = error(true_prev, estim_prev)
+    print(f'{error.__name__}={score:.5f}')