diff --git a/TODO.txt b/TODO.txt index cee90fa..7837c70 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,11 +1,8 @@ Documentation with sphinx -Add evaluation - artificial sampling Add quantification_report (akin to classification_report from sklearn) Add optimization - artificial sampling -Add prediction - artificial sampling -Add readers for typical datasets used in Quantification Add NAE, NRAE Add "measures for evaluating ordinal"? Document methods with paper references -The parallel training in svmperf seems not to work +The parallel training in svmperf seems not to work (not sure...) diff --git a/quapy/data/__init__.py b/quapy/data/__init__.py index e44efa4..9c119ab 100644 --- a/quapy/data/__init__.py +++ b/quapy/data/__init__.py @@ -1,5 +1,6 @@ from .base import * from .reader import * from . import preprocessing +from . import datasets diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py new file mode 100644 index 0000000..2c25de9 --- /dev/null +++ b/quapy/data/datasets.py @@ -0,0 +1,83 @@ +import zipfile +from utils.util import download_file_if_not_exists, download_file, get_quapy_home +import os +from os.path import join +from data.base import Dataset, LabelledCollection +from data.reader import from_text, from_sparse +from data.preprocessing import text2tfidf, reduce_columns + + +REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb'] +TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'semeval14', 'semeval15', 'semeval16', + 'sst', 'wa', 'wb'] + + +def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None): + assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \ + f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \ + f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}' + if data_home is None: + data_home = get_quapy_home() + + URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt' + URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt' + os.makedirs(join(data_home, 'reviews'), exist_ok=True) + train_path = join(data_home, 'reviews', dataset_name, 'train.txt') + test_path = join(data_home, 'reviews', dataset_name, 'test.txt') + download_file_if_not_exists(URL_TRAIN, train_path) + download_file_if_not_exists(URL_TEST, test_path) + + data = Dataset.load(train_path, test_path, from_text) + + if tfidf: + text2tfidf(data, inplace=True) + + if min_df is not None: + reduce_columns(data, min_df=min_df, inplace=True) + + return data + + +def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None): + assert dataset_name in TWITTER_SENTIMENT_DATASETS, \ + f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \ + f'Valid ones are {TWITTER_SENTIMENT_DATASETS}' + if data_home is None: + data_home = get_quapy_home() + + URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip' + unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam') + if not os.path.exists(unzipped_path): + downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip') + download_file(URL, downloaded_path) + with zipfile.ZipFile(downloaded_path) as file: + file.extractall(data_home) + os.remove(downloaded_path) + + if dataset_name in {'semeval13', 'semeval14', 'semeval15'}: + trainset_name = 'semeval' + testset_name = 'semeval' if model_selection else dataset_name + print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common " + f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}") + else: + trainset_name = testset_name = dataset_name + + if model_selection: + train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt') + test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt') + else: + train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt') + if dataset_name == 'semeval16': + test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt') + else: + test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt') + + data = Dataset.load(train, test, from_sparse) + + if min_df is not None: + reduce_columns(data, min_df=min_df, inplace=True) + + return data + + + diff --git a/quapy/data/reader.py b/quapy/data/reader.py index e160d15..84550c6 100644 --- a/quapy/data/reader.py +++ b/quapy/data/reader.py @@ -54,3 +54,4 @@ def from_sparse(path): X = X.tocsr() y = np.asarray(all_labels) + 1 return X, y + diff --git a/quapy/evaluation.py b/quapy/evaluation.py index 92c77c2..106eb11 100644 --- a/quapy/evaluation.py +++ b/quapy/evaluation.py @@ -1,4 +1,5 @@ from data import LabelledCollection +from quapy.method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier from method.base import BaseQuantifier from utils.util import temp_seed import numpy as np @@ -10,8 +11,8 @@ def artificial_sampling_prediction( model: BaseQuantifier, test: LabelledCollection, sample_size, - prevalence_points=21, - point_repetitions=1, + n_prevpoints=210, + n_repetitions=1, n_jobs=-1, random_seed=42): """ @@ -19,27 +20,40 @@ def artificial_sampling_prediction( :param model: the model in charge of generating the class prevalence estimations :param test: the test set on which to perform arificial sampling :param sample_size: the size of the samples - :param prevalence_points: the number of different prevalences to sample - :param point_repetitions: the number of repetitions for each prevalence + :param n_prevpoints: the number of different prevalences to sample + :param n_repetitions: the number of repetitions for each prevalence :param n_jobs: number of jobs to be run in parallel :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect any other random process. - :return: two ndarrays of [m,n] with m the number of samples (prevalence_points*point_repetitions) and n the + :return: two ndarrays of [m,n] with m the number of samples (n_prevpoints*n_repetitions) and n the number of classes. The first one contains the true prevalences for the samples generated while the second one containing the the prevalences estimations """ with temp_seed(random_seed): - indexes = list(test.artificial_sampling_index_generator(sample_size, prevalence_points, point_repetitions)) + indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions)) + + if isinstance(model, AggregativeQuantifier): + quantification_func = model.aggregate + if isinstance(model, AggregativeProbabilisticQuantifier): + print('\tpreclassifying with soft') + preclassified_instances = model.posterior_probabilities(test.instances) + else: + print('\tpreclassifying with hard') + preclassified_instances = model.classify(test.instances) + test = LabelledCollection(preclassified_instances, test.labels) + else: + quantification_func = model.quantify + print('not an aggregative') def _predict_prevalences(index): sample = test.sampling_from_index(index) true_prevalence = sample.prevalence() - estim_prevalence = model.quantify(sample.instances) + estim_prevalence = quantification_func(sample.instances) return true_prevalence, estim_prevalence results = Parallel(n_jobs=n_jobs)( - delayed(_predict_prevalences)(index) for index in tqdm(indexes) + delayed(_predict_prevalences)(index) for index in tqdm(indexes, desc='[artificial sampling protocol] predicting') ) true_prevalences, estim_prevalences = zip(*results) diff --git a/quapy/functional.py b/quapy/functional.py index d235b6b..c351990 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -36,6 +36,8 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01): def prevalence_from_labels(labels, n_classes): + if labels.ndim != 1: + raise ValueError(f'param labels does not seem to be a ndarray of label predictions') unique, counts = np.unique(labels, return_counts=True) by_class = defaultdict(lambda:0, dict(zip(unique, counts))) prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float) @@ -44,6 +46,8 @@ def prevalence_from_labels(labels, n_classes): def prevalence_from_probabilities(posteriors, binarize: bool = False): + if posteriors.ndim != 2: + raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities') if binarize: predictions = np.argmax(posteriors, axis=-1) return prevalence_from_labels(predictions, n_classes=posteriors.shape[1]) @@ -78,15 +82,15 @@ def normalize_prevalence(prevalences): -def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int): +def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1): """ - Computes the number of prevalence combinations in the nclasses-dimensional simplex if nprevpoints equally distant - prevalences are generated and nrepeats repetitions are requested - :param nclasses: number of classes - :param nprevpoints: number of prevalence points. - :param nrepeats: number of repetitions for each prevalence combination - :return: The number of possible combinations. For example, if nclasses=2, nprevpoints=5, nrepeats=1, then the number - of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0] + Computes the number of prevalence combinations in the n_classes-dimensional simplex if nprevpoints equally distant + prevalences are generated and n_repeats repetitions are requested + :param n_classes: number of classes + :param n_prevpoints: number of prevalence points. + :param n_repeats: number of repetitions for each prevalence combination + :return: The number of possible combinations. For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the + number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0] """ __cache={} def __f(nc,np): @@ -98,25 +102,25 @@ def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int): x = sum([__f(nc-1, np-i) for i in range(np)]) __cache[(nc,np)] = x return x - return __f(nclasses, nprevpoints) * nrepeats + return __f(n_classes, n_prevpoints) * n_repeats -def get_nprevpoints_approximation(nclasses, nrepeats, combinations_budget): +def get_nprevpoints_approximation(combinations_budget:int, n_classes:int, n_repeats:int=1): """ - Searches for the largest number of (equidistant) prevalence points to define for each of the nclasses classe so that - the number of valid prevalences generated as combinations of prevalence points (points in a nclasses-dimensional + Searches for the largest number of (equidistant) prevalence points to define for each of the n_classes classes so that + the number of valid prevalences generated as combinations of prevalence points (points in a n_classes-dimensional simplex) do not exceed combinations_budget. - :param nclasses: number of classes - :param nrepeats: number of repetitions for each prevalence combination + :param n_classes: number of classes + :param n_repeats: number of repetitions for each prevalence combination :param combinations_budget: maximum number of combinatios allowed :return: the largest number of prevalence points that generate less than combinations_budget valid prevalences """ - assert nclasses>0 and nrepeats>0 and combinations_budget>0, 'parameters must be positive integers' - nprevpoints = 1 + assert n_classes > 0 and n_repeats > 0 and combinations_budget > 0, 'parameters must be positive integers' + n_prevpoints = 1 while True: - combinations = num_prevalence_combinations(nclasses, nprevpoints, nrepeats) + combinations = num_prevalence_combinations(n_prevpoints, n_classes, n_repeats) if combinations > combinations_budget: - return nprevpoints-1 + return n_prevpoints-1 else: - nprevpoints+=1 + n_prevpoints += 1 diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py index 88acd16..b5bbd72 100644 --- a/quapy/method/__init__.py +++ b/quapy/method/__init__.py @@ -8,7 +8,7 @@ AGGREGATIVE_METHODS = { agg.AdjustedClassifyAndCount, agg.ProbabilisticClassifyAndCount, agg.ProbabilisticAdjustedClassifyAndCount, - agg.ExplicitLossMinimisation, + agg.ExplicitLossMinimisationBinary, agg.ExpectationMaximizationQuantifier, agg.HellingerDistanceY } diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index c2857f7..f2d2756 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -34,6 +34,13 @@ class AggregativeQuantifier(BaseQuantifier): def classify(self, instances): return self.learner.predict(instances) + def quantify(self, instances, *args): + classif_predictions = self.classify(instances) + return self.aggregate(classif_predictions, *args) + + @abstractmethod + def aggregate(self, classif_predictions:np.ndarray, *args): ... + def get_params(self, deep=True): return self.learner.get_params() @@ -53,13 +60,17 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier): """ Abstract class for quantification methods that base their estimations on the aggregation of posterior probabilities as returned by a probabilistic classifier. Aggregative Probabilistic Quantifiers thus extend Aggregative - Quantifiersimplement by implementing a _soft_classify_ method returning values in [0,1] -- the posterior + Quantifiersimplement by implementing a _posterior_probabilities_ method returning values in [0,1] -- the posterior probabilities. """ - def soft_classify(self, data): + def posterior_probabilities(self, data): return self.learner.predict_proba(data) + def quantify(self, instances, *args): + classif_posteriors = self.posterior_probabilities(instances) + return self.aggregate(classif_posteriors, *args) + def set_params(self, **parameters): if isinstance(self.learner, CalibratedClassifierCV): parameters={'base_estimator__'+k:v for k,v in parameters.items()} @@ -128,9 +139,8 @@ class ClassifyAndCount(AggregativeQuantifier): self.learner, _ = training_helper(self.learner, data, fit_learner) return self - def quantify(self, instances, *args): - classification = self.classify(instances) # classify - return F.prevalence_from_labels(classification, self.n_classes) # & count + def aggregate(self, classif_predictions, *args): + return F.prevalence_from_labels(classif_predictions, self.n_classes) class AdjustedClassifyAndCount(AggregativeQuantifier): @@ -141,17 +151,24 @@ class AdjustedClassifyAndCount(AggregativeQuantifier): def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6): self.learner, validation = training_helper(self.learner, data, fit_learner, train_val_split=train_val_split) self.cc = ClassifyAndCount(self.learner) - y_ = self.cc.classify(validation.instances) + y_ = self.classify(validation.instances) y = validation.labels # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # document that belongs to yj ends up being classified as belonging to yi self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts() return self - def quantify(self, instances, *args): - prevs_estim = self.cc.quantify(instances) - # solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim - A = self.Pte_cond_estim_ + def classify(self, data): + return self.cc.classify(data) + + def aggregate(self, classif_predictions, *args): + prevs_estim = self.cc.aggregate(classif_predictions) + return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim) + + @classmethod + def solve_adjustment(cls, PteCondEstim, prevs_estim): + # solve for the linear system Ax = B with A=PteCondEstim and B = prevs_estim + A = PteCondEstim B = prevs_estim try: adjusted_prevs = np.linalg.solve(A, B) @@ -161,9 +178,6 @@ class AdjustedClassifyAndCount(AggregativeQuantifier): adjusted_prevs = prevs_estim # no way to adjust them! return adjusted_prevs - def classify(self, data): - return self.cc.classify(data) - class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier): def __init__(self, learner): @@ -173,13 +187,11 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier): self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) return self - def quantify(self, instances, *args): - posteriors = self.soft_classify(instances) # classify - prevalences = F.prevalence_from_probabilities(posteriors, binarize=False) # & count - return prevalences + def aggregate(self, classif_posteriors, *args): + return F.prevalence_from_probabilities(classif_posteriors, binarize=False) -class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier): +class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier): def __init__(self, learner): self.learner = learner @@ -189,28 +201,23 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier): self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split ) self.pcc = ProbabilisticClassifyAndCount(self.learner) - y_ = self.pcc.classify(validation.instances) + y_ = self.classify(validation.instances) y = validation.labels # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # document that belongs to yj ends up being classified as belonging to yi self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts() return self - def quantify(self, instances, *args): - prevs_estim = self.pcc.quantify(instances) - A = self.Pte_cond_estim_ - B = prevs_estim - try: - adjusted_prevs = np.linalg.solve(A, B) - adjusted_prevs = np.clip(adjusted_prevs, 0, 1) - adjusted_prevs /= adjusted_prevs.sum() - except np.linalg.LinAlgError: - adjusted_prevs = prevs_estim # no way to adjust them! - return adjusted_prevs + def aggregate(self, classif_posteriors, *args): + prevs_estim = self.pcc.aggregate(classif_posteriors) + return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim) def classify(self, data): return self.pcc.classify(data) + def soft_classify(self, data): + return self.pcc.posterior_probabilities(data) + class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier): @@ -226,10 +233,8 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier): self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes) return self - def quantify(self, X, epsilon=EPSILON): - tr_prev=self.train_prevalence - posteriors = self.soft_classify(X) - return self.EM(tr_prev, posteriors, self.verbose, epsilon) + def aggregate(self, classif_posteriors, epsilon=EPSILON): + return self.EM(self.train_prevalence, classif_posteriors, self.verbose, epsilon) @classmethod def EM(cls, tr_prev, posterior_probabilities, verbose=False, epsilon=EPSILON): @@ -277,17 +282,17 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier): f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.' self.learner, validation = training_helper( self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split) - Px = self.soft_classify(validation.instances) + Px = self.posterior_probabilities(validation.instances) self.Pxy1 = Px[validation.labels == 1] self.Pxy0 = Px[validation.labels == 0] return self - def quantify(self, instances, *args): + def aggregate(self, classif_posteriors, *args): # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10, # and the final estimated a priori probability was taken as the median of these 11 estimates." # (González-Castro, et al., 2013). - Px = self.soft_classify(instances) + Px = classif_posteriors prev_estimations = [] for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110] @@ -318,71 +323,87 @@ class OneVsAll(AggregativeQuantifier): quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1. """ - def __init__(self, binary_method, n_jobs=-1): - self.binary_method = binary_method + def __init__(self, binary_quantifier, n_jobs=-1): + self.binary_quantifier = binary_quantifier self.n_jobs = n_jobs def fit(self, data: LabelledCollection, **kwargs): - assert not data.binary, f'{self.__class__.__name__} expect non-binary data' - assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier' - self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_} - Parallel(n_jobs=self.n_jobs, backend='threading')( - delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_ - ) + assert not data.binary, \ + f'{self.__class__.__name__} expect non-binary data' + assert isinstance(self.binary_quantifier, BaseQuantifier), \ + f'{self.binary_quantifier} does not seem to be a Quantifier' + self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_} + self.__parallel(self._delayed_binary_fit, data, **kwargs) return self + def classify(self, instances): + classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances) + return classif_predictions_bin.T + + def aggregate(self, classif_predictions_bin, *args): + assert set(np.unique(classif_predictions_bin)) == {0,1}, \ + 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \ + 'predictions for each document (row) and class (columns)' + prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin) + return F.normalize_prevalence(prevalences) + def quantify(self, X, *args): - prevalences = np.asarray( + prevalences = self.__parallel(self._delayed_binary_quantify, X) + return F.normalize_prevalence(prevalences) + + def __parallel(self, func, *args, **kwargs): + return np.asarray( Parallel(n_jobs=self.n_jobs, backend='threading')( - delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes + delayed(func)(c, *args, **kwargs) for c in self.classes ) ) -<<<<<<< HEAD -======= - print('one vs all: ', prevalences) ->>>>>>> 2361186a01c53e744f4291e2e2299700216ff139 - return F.normalize_prevalence(prevalences) @property def classes(self): - return sorted(self.class_method.keys()) + return sorted(self.dict_binary_quantifiers.keys()) def set_params(self, **parameters): - self.binary_method.set_params(**parameters) + self.binary_quantifier.set_params(**parameters) def get_params(self, deep=True): - return self.binary_method.get_params() + return self.binary_quantifier.get_params() - def _delayed_binary_predict(self, c, learners, X): - return learners[c].classify(X).mean() # the mean is the estimation for the positive class prevalence + def _delayed_binary_classification(self, c, X): + return self.dict_binary_quantifiers[c].classify(X) - def _delayed_binary_fit(self, c, learners, data, **kwargs): + def _delayed_binary_quantify(self, c, X): + return self.dict_binary_quantifiers[c].quantify(X)[1] # the estimation for the positive class prevalence + + def _delayed_binary_aggregate(self, c, classif_predictions): + return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1] # the estimation for the positive class prevalence + + def _delayed_binary_fit(self, c, data, **kwargs): bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2) - learners[c].fit(bindata, **kwargs) + self.dict_binary_quantifiers[c].fit(bindata, **kwargs) -class ExplicitLossMinimisation(AggregativeQuantifier): - """ - A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary - quantifier for each class and then l1-normalizes the class predictions so that they sum up to one. - This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis. - Social Network Analysis and Mining6(19), 1–22 (2016) - """ - - def __init__(self, svmperf_base, loss, **kwargs): - self.svmperf_base = svmperf_base - self.loss = loss - self.kwargs = kwargs - - def fit(self, data: LabelledCollection, fit_learner=True, *args): - assert fit_learner, 'the method requires that fit_learner=True' - self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs) - if not data.binary: - self.learner = OneVsAll(self.learner, n_jobs=-1) - return self.learner.fit(data, *args) - - def quantify(self, instances, *args): - return self.learner.quantify(instances, *args) +# class ExplicitLossMinimisation(AggregativeQuantifier): +# """ +# A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary +# quantifier for each class and then l1-normalizes the class predictions so that they sum up to one. +# This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis. +# Social Network Analysis and Mining6(19), 1–22 (2016) +# """ +# +# def __init__(self, svmperf_base, loss, **kwargs): +# self.svmperf_base = svmperf_base +# self.loss = loss +# self.kwargs = kwargs +# +# def fit(self, data: LabelledCollection, fit_learner=True, *args): +# assert fit_learner, 'the method requires that fit_learner=True' +# self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs) +# if not data.binary: +# self.learner = OneVsAll(self.learner, n_jobs=-1) +# return self.learner.fit(data, *args) +# +# def aggregate(self, instances, *args): +# return self.learner.aggregate(instances, *args) class ExplicitLossMinimisationBinary(AggregativeQuantifier): @@ -398,38 +419,35 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier): self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels) return self - def quantify(self, X, y=None): - predictions = self.learner.predict(X) - prev = F.prevalence_from_labels(predictions, self.learner.n_classes_) - print('binary: ', prev) - return prev + def aggregate(self, classif_predictions:np.ndarray, *args): + return F.prevalence_from_labels(classif_predictions, self.learner.n_classes_) def classify(self, X, y=None): return self.learner.predict(X) -class SVMQ(ExplicitLossMinimisation): +class SVMQ(ExplicitLossMinimisationBinary): def __init__(self, svmperf_base, **kwargs): super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs) -class SVMKLD(ExplicitLossMinimisation): +class SVMKLD(ExplicitLossMinimisationBinary): def __init__(self, svmperf_base, **kwargs): super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs) -class SVMNKLD(ExplicitLossMinimisation): +class SVMNKLD(ExplicitLossMinimisationBinary): def __init__(self, svmperf_base, **kwargs): super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs) -class SVMAE(ExplicitLossMinimisation): +class SVMAE(ExplicitLossMinimisationBinary): def __init__(self, svmperf_base, **kwargs): super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs) -class SVMRAE(ExplicitLossMinimisation): +class SVMRAE(ExplicitLossMinimisationBinary): def __init__(self, svmperf_base, **kwargs): super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs) @@ -438,7 +456,7 @@ CC = ClassifyAndCount ACC = AdjustedClassifyAndCount PCC = ProbabilisticClassifyAndCount PACC = ProbabilisticAdjustedClassifyAndCount -ELM = ExplicitLossMinimisation +ELM = ExplicitLossMinimisationBinary EMQ = ExpectationMaximizationQuantifier HDy = HellingerDistanceY diff --git a/quapy/method/base.py b/quapy/method/base.py index e65b45e..9561a27 100644 --- a/quapy/method/base.py +++ b/quapy/method/base.py @@ -18,3 +18,48 @@ class BaseQuantifier(metaclass=ABCMeta): def get_params(self, deep=True): ... +# class OneVsAll(AggregativeQuantifier): +# """ +# Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary +# quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1. +# """ +# +# def __init__(self, binary_method, n_jobs=-1): +# self.binary_method = binary_method +# self.n_jobs = n_jobs +# +# def fit(self, data: LabelledCollection, **kwargs): +# assert not data.binary, f'{self.__class__.__name__} expect non-binary data' +# assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier' +# self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_} +# Parallel(n_jobs=self.n_jobs, backend='threading')( +# delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_ +# ) +# return self +# +# def quantify(self, X, *args): +# prevalences = np.asarray( +# Parallel(n_jobs=self.n_jobs, backend='threading')( +# delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes +# ) +# ) +# return F.normalize_prevalence(prevalences) +# +# @property +# def classes(self): +# return sorted(self.class_method.keys()) +# +# def set_params(self, **parameters): +# self.binary_method.set_params(**parameters) +# +# def get_params(self, deep=True): +# return self.binary_method.get_params() +# +# def _delayed_binary_predict(self, c, learners, X): +# return learners[c].quantify(X)[:,1] # the mean is the estimation for the positive class prevalence +# +# def _delayed_binary_fit(self, c, learners, data, **kwargs): +# bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2) +# learners[c].fit(bindata, **kwargs) + + diff --git a/quapy/utils/util.py b/quapy/utils/util.py index 583cb1a..921ab1b 100644 --- a/quapy/utils/util.py +++ b/quapy/utils/util.py @@ -3,6 +3,10 @@ import multiprocessing from joblib import Parallel, delayed import contextlib import numpy as np +import urllib +import os +from pathlib import Path + @@ -33,3 +37,27 @@ def temp_seed(seed): finally: np.random.set_state(state) + +def download_file(url, archive_filename): + def progress(blocknum, bs, size): + total_sz_mb = '%.2f MB' % (size / 1e6) + current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) + print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='') + print("Downloading %s" % url) + urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) + print("") + + +def download_file_if_not_exists(url, archive_path): + if os.path.exists(archive_path): + return + create_if_not_exist(os.path.dirname(archive_path)) + download_file(url,archive_path) + + +def create_if_not_exist(path): + os.makedirs(path, exist_ok=True) + + +def get_quapy_home(): + return os.path.join(str(Path.home()), 'quapy_data') \ No newline at end of file diff --git a/test.py b/test.py index b6cb243..85d8bb6 100644 --- a/test.py +++ b/test.py @@ -2,37 +2,45 @@ from sklearn.linear_model import LogisticRegression from sklearn.svm import LinearSVC import quapy as qp import quapy.functional as F +import sys +#qp.datasets.fetch_reviews('hp') +#qp.datasets.fetch_twitter('sst') + +#sys.exit() SAMPLE_SIZE=500 binary = False +svmperf_home = './svm_perf_quantification' if binary: - # load a textual binary dataset and create a tfidf bag of words - train_path = './datasets/reviews/kindle/train.txt' - test_path = './datasets/reviews/kindle/test.txt' - dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text) - qp.preprocessing.text2tfidf(dataset, inplace=True) - qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True) + dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5) else: - # load a sparse matrix ternary dataset - train_path = './datasets/twitter/train/sst.train+dev.feature.txt' - test_path = './datasets/twitter/test/sst.test.feature.txt' - dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse) + dataset = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10) + dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3) + +print('dataset loaded') # training a quantifier learner = LogisticRegression() -model = qp.method.aggregative.ClassifyAndCount(learner) -# model = qp.method.aggregative.AdjustedClassifyAndCount(learner) +# model = qp.method.aggregative.ClassifyAndCount(learner) # model = qp.method.aggregative.AdjustedClassifyAndCount(learner) # model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner) # model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner) # model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner) +# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100) +model = qp.method.aggregative.SVMQ(svmperf_home, C=1) +if not binary: + model = qp.method.aggregative.OneVsAll(model) + +print('fitting model') model.fit(dataset.training) + # estimating class prevalences +print('quantifying') prevalences_estim = model.quantify(dataset.test.instances) prevalences_true = dataset.test.prevalence() @@ -46,9 +54,17 @@ print(f'true prevalence {F.strprev(prevalences_true)}') print(f'estim prevalence {F.strprev(prevalences_estim)}') print(f'mae={error:.3f}') -true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE) -qp.error.SAMPLE_SIZE=SAMPLE_SIZE +max_evaluations = 5000 +n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes) +n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes) +print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that\n' + f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n' + f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.') + +true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints) + +qp.error.SAMPLE_SIZE = SAMPLE_SIZE print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)') for error in qp.error.QUANTIFICATION_ERROR: score = error(true_prev, estim_prev)