This commit is contained in:
Alejandro Moreo Fernandez 2020-12-15 13:39:43 +01:00
commit d6edfe983e
11 changed files with 345 additions and 138 deletions

View File

@ -1,11 +1,8 @@
Documentation with sphinx Documentation with sphinx
Add evaluation - artificial sampling
Add quantification_report (akin to classification_report from sklearn) Add quantification_report (akin to classification_report from sklearn)
Add optimization - artificial sampling Add optimization - artificial sampling
Add prediction - artificial sampling
Add readers for typical datasets used in Quantification
Add NAE, NRAE Add NAE, NRAE
Add "measures for evaluating ordinal"? Add "measures for evaluating ordinal"?
Document methods with paper references Document methods with paper references
The parallel training in svmperf seems not to work The parallel training in svmperf seems not to work (not sure...)

View File

@ -1,5 +1,6 @@
from .base import * from .base import *
from .reader import * from .reader import *
from . import preprocessing from . import preprocessing
from . import datasets

83
quapy/data/datasets.py Normal file
View File

@ -0,0 +1,83 @@
import zipfile
from utils.util import download_file_if_not_exists, download_file, get_quapy_home
import os
from os.path import join
from data.base import Dataset, LabelledCollection
from data.reader import from_text, from_sparse
from data.preprocessing import text2tfidf, reduce_columns
# Dataset names accepted by fetch_reviews; each name is used to build the download URLs and the local paths.
REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
# Dataset names accepted by fetch_twitter; each name selects the feature files inside the unzipped archive.
TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'semeval14', 'semeval15', 'semeval16',
'sst', 'wa', 'wb']
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
    """
    Downloads (only if not already present locally) and loads one of the sentiment reviews datasets.

    :param dataset_name: the name of the dataset; must be one of REVIEWS_SENTIMENT_DATASETS
    :param tfidf: if True, text2tfidf is applied (in place) to the loaded dataset
    :param min_df: if not None, reduce_columns is applied (in place) with this value as its min_df argument
    :param data_home: the root directory where the files are stored (in 'reviews/<dataset_name>/');
        if None, the directory returned by get_quapy_home() is used
    :return: the dataset returned by Dataset.load for the train and test files (read with from_text),
        after the optional transformations requested
    """
    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
    URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'

    # the files are stored in reviews/<dataset_name>/, so this is the directory that must exist before
    # downloading (makedirs also creates the intermediate 'reviews' directory)
    dataset_dir = join(data_home, 'reviews', dataset_name)
    os.makedirs(dataset_dir, exist_ok=True)
    train_path = join(dataset_dir, 'train.txt')
    test_path = join(dataset_dir, 'test.txt')
    download_file_if_not_exists(URL_TRAIN, train_path)
    download_file_if_not_exists(URL_TEST, test_path)

    data = Dataset.load(train_path, test_path, from_text)

    if tfidf:
        text2tfidf(data, inplace=True)
    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    return data
def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
    """
    Downloads (only if not already unzipped locally) and loads one of the twitter sentiment datasets.

    :param dataset_name: the name of the dataset; must be one of TWITTER_SENTIMENT_DATASETS
    :param model_selection: if True, the '<name>.train.feature.txt' file is loaded as the training set and the
        '<name>.dev.feature.txt' file as the test set; if False, the '<name>.train+dev.feature.txt' file is loaded
        as the training set and the '<name>.test.feature.txt' file ('<name>.dev-test.feature.txt' for 'semeval16')
        as the test set
    :param min_df: if not None, reduce_columns is applied (in place) with this value as its min_df argument
    :param data_home: the directory where the archive is downloaded and unzipped; if None, the directory returned
        by get_quapy_home() is used
    :return: the dataset returned by Dataset.load for the selected train and test files (read with from_sparse)
    """
    assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
        f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
    if not os.path.exists(unzipped_path):
        # a user-provided data_home might not exist yet; the zip file is written directly inside it
        os.makedirs(data_home, exist_ok=True)
        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
        download_file(URL, downloaded_path)
        with zipfile.ZipFile(downloaded_path) as file:
            file.extractall(data_home)
        # the archive is no longer needed once its content has been extracted
        os.remove(downloaded_path)

    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
        # these three datasets share the same training and development files, stored under the name 'semeval'
        trainset_name = 'semeval'
        testset_name = 'semeval' if model_selection else dataset_name
        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
              f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
    else:
        trainset_name = testset_name = dataset_name

    if model_selection:
        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
        test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
    else:
        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
        if dataset_name == 'semeval16':
            # the test file of semeval16 is named differently from that of the rest of the datasets
            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
        else:
            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')

    data = Dataset.load(train, test, from_sparse)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    return data

View File

@ -54,3 +54,4 @@ def from_sparse(path):
X = X.tocsr() X = X.tocsr()
y = np.asarray(all_labels) + 1 y = np.asarray(all_labels) + 1
return X, y return X, y

View File

@ -1,4 +1,5 @@
from data import LabelledCollection from data import LabelledCollection
from quapy.method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier
from method.base import BaseQuantifier from method.base import BaseQuantifier
from utils.util import temp_seed from utils.util import temp_seed
import numpy as np import numpy as np
@ -10,8 +11,8 @@ def artificial_sampling_prediction(
model: BaseQuantifier, model: BaseQuantifier,
test: LabelledCollection, test: LabelledCollection,
sample_size, sample_size,
prevalence_points=21, n_prevpoints=210,
point_repetitions=1, n_repetitions=1,
n_jobs=-1, n_jobs=-1,
random_seed=42): random_seed=42):
""" """
@ -19,27 +20,40 @@ def artificial_sampling_prediction(
:param model: the model in charge of generating the class prevalence estimations :param model: the model in charge of generating the class prevalence estimations
:param test: the test set on which to perform artificial sampling :param test: the test set on which to perform artificial sampling
:param sample_size: the size of the samples :param sample_size: the size of the samples
:param prevalence_points: the number of different prevalences to sample :param n_prevpoints: the number of different prevalences to sample
:param point_repetitions: the number of repetitions for each prevalence :param n_repetitions: the number of repetitions for each prevalence
:param n_jobs: number of jobs to be run in parallel :param n_jobs: number of jobs to be run in parallel
:param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
any other random process. any other random process.
:return: two ndarrays of [m,n] with m the number of samples (prevalence_points*point_repetitions) and n the :return: two ndarrays of [m,n] with m the number of samples (n_prevpoints*n_repetitions) and n the
number of classes. The first one contains the true prevalences for the samples generated while the second one number of classes. The first one contains the true prevalences for the samples generated while the second one
containing the prevalence estimations containing the prevalence estimations
""" """
with temp_seed(random_seed): with temp_seed(random_seed):
indexes = list(test.artificial_sampling_index_generator(sample_size, prevalence_points, point_repetitions)) indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions))
if isinstance(model, AggregativeQuantifier):
quantification_func = model.aggregate
if isinstance(model, AggregativeProbabilisticQuantifier):
print('\tpreclassifying with soft')
preclassified_instances = model.posterior_probabilities(test.instances)
else:
print('\tpreclassifying with hard')
preclassified_instances = model.classify(test.instances)
test = LabelledCollection(preclassified_instances, test.labels)
else:
quantification_func = model.quantify
print('not an aggregative')
def _predict_prevalences(index): def _predict_prevalences(index):
sample = test.sampling_from_index(index) sample = test.sampling_from_index(index)
true_prevalence = sample.prevalence() true_prevalence = sample.prevalence()
estim_prevalence = model.quantify(sample.instances) estim_prevalence = quantification_func(sample.instances)
return true_prevalence, estim_prevalence return true_prevalence, estim_prevalence
results = Parallel(n_jobs=n_jobs)( results = Parallel(n_jobs=n_jobs)(
delayed(_predict_prevalences)(index) for index in tqdm(indexes) delayed(_predict_prevalences)(index) for index in tqdm(indexes, desc='[artificial sampling protocol] predicting')
) )
true_prevalences, estim_prevalences = zip(*results) true_prevalences, estim_prevalences = zip(*results)

View File

@ -36,6 +36,8 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
def prevalence_from_labels(labels, n_classes): def prevalence_from_labels(labels, n_classes):
if labels.ndim != 1:
raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
unique, counts = np.unique(labels, return_counts=True) unique, counts = np.unique(labels, return_counts=True)
by_class = defaultdict(lambda:0, dict(zip(unique, counts))) by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float) prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float)
@ -44,6 +46,8 @@ def prevalence_from_labels(labels, n_classes):
def prevalence_from_probabilities(posteriors, binarize: bool = False): def prevalence_from_probabilities(posteriors, binarize: bool = False):
if posteriors.ndim != 2:
raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities')
if binarize: if binarize:
predictions = np.argmax(posteriors, axis=-1) predictions = np.argmax(posteriors, axis=-1)
return prevalence_from_labels(predictions, n_classes=posteriors.shape[1]) return prevalence_from_labels(predictions, n_classes=posteriors.shape[1])
@ -78,15 +82,15 @@ def normalize_prevalence(prevalences):
def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int): def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1):
""" """
Computes the number of prevalence combinations in the nclasses-dimensional simplex if nprevpoints equally distant Computes the number of prevalence combinations in the n_classes-dimensional simplex if nprevpoints equally distant
prevalences are generated and nrepeats repetitions are requested prevalences are generated and n_repeats repetitions are requested
:param nclasses: number of classes :param n_classes: number of classes
:param nprevpoints: number of prevalence points. :param n_prevpoints: number of prevalence points.
:param nrepeats: number of repetitions for each prevalence combination :param n_repeats: number of repetitions for each prevalence combination
:return: The number of possible combinations. For example, if nclasses=2, nprevpoints=5, nrepeats=1, then the number :return: The number of possible combinations. For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the
of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0] number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
""" """
__cache={} __cache={}
def __f(nc,np): def __f(nc,np):
@ -98,25 +102,25 @@ def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
x = sum([__f(nc-1, np-i) for i in range(np)]) x = sum([__f(nc-1, np-i) for i in range(np)])
__cache[(nc,np)] = x __cache[(nc,np)] = x
return x return x
return __f(nclasses, nprevpoints) * nrepeats return __f(n_classes, n_prevpoints) * n_repeats
def get_nprevpoints_approximation(nclasses, nrepeats, combinations_budget): def get_nprevpoints_approximation(combinations_budget:int, n_classes:int, n_repeats:int=1):
""" """
Searches for the largest number of (equidistant) prevalence points to define for each of the nclasses classe so that Searches for the largest number of (equidistant) prevalence points to define for each of the n_classes classes so that
the number of valid prevalences generated as combinations of prevalence points (points in a nclasses-dimensional the number of valid prevalences generated as combinations of prevalence points (points in a n_classes-dimensional
simplex) do not exceed combinations_budget. simplex) do not exceed combinations_budget.
:param nclasses: number of classes :param n_classes: number of classes
:param nrepeats: number of repetitions for each prevalence combination :param n_repeats: number of repetitions for each prevalence combination
:param combinations_budget: maximum number of combinations allowed :param combinations_budget: maximum number of combinations allowed
:return: the largest number of prevalence points that generate less than combinations_budget valid prevalences :return: the largest number of prevalence points that generate less than combinations_budget valid prevalences
""" """
assert nclasses>0 and nrepeats>0 and combinations_budget>0, 'parameters must be positive integers' assert n_classes > 0 and n_repeats > 0 and combinations_budget > 0, 'parameters must be positive integers'
nprevpoints = 1 n_prevpoints = 1
while True: while True:
combinations = num_prevalence_combinations(nclasses, nprevpoints, nrepeats) combinations = num_prevalence_combinations(n_prevpoints, n_classes, n_repeats)
if combinations > combinations_budget: if combinations > combinations_budget:
return nprevpoints-1 return n_prevpoints-1
else: else:
nprevpoints+=1 n_prevpoints += 1

View File

@ -8,7 +8,7 @@ AGGREGATIVE_METHODS = {
agg.AdjustedClassifyAndCount, agg.AdjustedClassifyAndCount,
agg.ProbabilisticClassifyAndCount, agg.ProbabilisticClassifyAndCount,
agg.ProbabilisticAdjustedClassifyAndCount, agg.ProbabilisticAdjustedClassifyAndCount,
agg.ExplicitLossMinimisation, agg.ExplicitLossMinimisationBinary,
agg.ExpectationMaximizationQuantifier, agg.ExpectationMaximizationQuantifier,
agg.HellingerDistanceY agg.HellingerDistanceY
} }

View File

@ -34,6 +34,13 @@ class AggregativeQuantifier(BaseQuantifier):
def classify(self, instances): def classify(self, instances):
return self.learner.predict(instances) return self.learner.predict(instances)
def quantify(self, instances, *args):
classif_predictions = self.classify(instances)
return self.aggregate(classif_predictions, *args)
@abstractmethod
def aggregate(self, classif_predictions:np.ndarray, *args): ...
def get_params(self, deep=True): def get_params(self, deep=True):
return self.learner.get_params() return self.learner.get_params()
@ -53,13 +60,17 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
""" """
Abstract class for quantification methods that base their estimations on the aggregation of posterior probabilities Abstract class for quantification methods that base their estimations on the aggregation of posterior probabilities
as returned by a probabilistic classifier. Aggregative Probabilistic Quantifiers thus extend Aggregative as returned by a probabilistic classifier. Aggregative Probabilistic Quantifiers thus extend Aggregative
Quantifiers by implementing a _soft_classify_ method returning values in [0,1] -- the posterior Quantifiers by implementing a _posterior_probabilities_ method returning values in [0,1] -- the posterior
probabilities. probabilities.
""" """
def soft_classify(self, data): def posterior_probabilities(self, data):
return self.learner.predict_proba(data) return self.learner.predict_proba(data)
def quantify(self, instances, *args):
classif_posteriors = self.posterior_probabilities(instances)
return self.aggregate(classif_posteriors, *args)
def set_params(self, **parameters): def set_params(self, **parameters):
if isinstance(self.learner, CalibratedClassifierCV): if isinstance(self.learner, CalibratedClassifierCV):
parameters={'base_estimator__'+k:v for k,v in parameters.items()} parameters={'base_estimator__'+k:v for k,v in parameters.items()}
@ -128,9 +139,8 @@ class ClassifyAndCount(AggregativeQuantifier):
self.learner, _ = training_helper(self.learner, data, fit_learner) self.learner, _ = training_helper(self.learner, data, fit_learner)
return self return self
def quantify(self, instances, *args): def aggregate(self, classif_predictions, *args):
classification = self.classify(instances) # classify return F.prevalence_from_labels(classif_predictions, self.n_classes)
return F.prevalence_from_labels(classification, self.n_classes) # & count
class AdjustedClassifyAndCount(AggregativeQuantifier): class AdjustedClassifyAndCount(AggregativeQuantifier):
@ -141,17 +151,24 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6): def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
self.learner, validation = training_helper(self.learner, data, fit_learner, train_val_split=train_val_split) self.learner, validation = training_helper(self.learner, data, fit_learner, train_val_split=train_val_split)
self.cc = ClassifyAndCount(self.learner) self.cc = ClassifyAndCount(self.learner)
y_ = self.cc.classify(validation.instances) y_ = self.classify(validation.instances)
y = validation.labels y = validation.labels
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi # document that belongs to yj ends up being classified as belonging to yi
self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts() self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
return self return self
def quantify(self, instances, *args): def classify(self, data):
prevs_estim = self.cc.quantify(instances) return self.cc.classify(data)
# solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim
A = self.Pte_cond_estim_ def aggregate(self, classif_predictions, *args):
prevs_estim = self.cc.aggregate(classif_predictions)
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
@classmethod
def solve_adjustment(cls, PteCondEstim, prevs_estim):
# solve for the linear system Ax = B with A=PteCondEstim and B = prevs_estim
A = PteCondEstim
B = prevs_estim B = prevs_estim
try: try:
adjusted_prevs = np.linalg.solve(A, B) adjusted_prevs = np.linalg.solve(A, B)
@ -161,9 +178,6 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
adjusted_prevs = prevs_estim # no way to adjust them! adjusted_prevs = prevs_estim # no way to adjust them!
return adjusted_prevs return adjusted_prevs
def classify(self, data):
return self.cc.classify(data)
class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier): class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
def __init__(self, learner): def __init__(self, learner):
@ -173,13 +187,11 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True) self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
return self return self
def quantify(self, instances, *args): def aggregate(self, classif_posteriors, *args):
posteriors = self.soft_classify(instances) # classify return F.prevalence_from_probabilities(classif_posteriors, binarize=False)
prevalences = F.prevalence_from_probabilities(posteriors, binarize=False) # & count
return prevalences
class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier): class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
def __init__(self, learner): def __init__(self, learner):
self.learner = learner self.learner = learner
@ -189,28 +201,23 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split
) )
self.pcc = ProbabilisticClassifyAndCount(self.learner) self.pcc = ProbabilisticClassifyAndCount(self.learner)
y_ = self.pcc.classify(validation.instances) y_ = self.classify(validation.instances)
y = validation.labels y = validation.labels
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi # document that belongs to yj ends up being classified as belonging to yi
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts() self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
return self return self
def quantify(self, instances, *args): def aggregate(self, classif_posteriors, *args):
prevs_estim = self.pcc.quantify(instances) prevs_estim = self.pcc.aggregate(classif_posteriors)
A = self.Pte_cond_estim_ return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
B = prevs_estim
try:
adjusted_prevs = np.linalg.solve(A, B)
adjusted_prevs = np.clip(adjusted_prevs, 0, 1)
adjusted_prevs /= adjusted_prevs.sum()
except np.linalg.LinAlgError:
adjusted_prevs = prevs_estim # no way to adjust them!
return adjusted_prevs
def classify(self, data): def classify(self, data):
return self.pcc.classify(data) return self.pcc.classify(data)
def soft_classify(self, data):
return self.pcc.posterior_probabilities(data)
class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier): class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
@ -226,10 +233,8 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes) self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes)
return self return self
def quantify(self, X, epsilon=EPSILON): def aggregate(self, classif_posteriors, epsilon=EPSILON):
tr_prev=self.train_prevalence return self.EM(self.train_prevalence, classif_posteriors, self.verbose, epsilon)
posteriors = self.soft_classify(X)
return self.EM(tr_prev, posteriors, self.verbose, epsilon)
@classmethod @classmethod
def EM(cls, tr_prev, posterior_probabilities, verbose=False, epsilon=EPSILON): def EM(cls, tr_prev, posterior_probabilities, verbose=False, epsilon=EPSILON):
@ -277,17 +282,17 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.' f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
self.learner, validation = training_helper( self.learner, validation = training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split) self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
Px = self.soft_classify(validation.instances) Px = self.posterior_probabilities(validation.instances)
self.Pxy1 = Px[validation.labels == 1] self.Pxy1 = Px[validation.labels == 1]
self.Pxy0 = Px[validation.labels == 0] self.Pxy0 = Px[validation.labels == 0]
return self return self
def quantify(self, instances, *args): def aggregate(self, classif_posteriors, *args):
# "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10, # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
# and the final estimated a priori probability was taken as the median of these 11 estimates." # and the final estimated a priori probability was taken as the median of these 11 estimates."
# (González-Castro, et al., 2013). # (González-Castro, et al., 2013).
Px = self.soft_classify(instances) Px = classif_posteriors
prev_estimations = [] prev_estimations = []
for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110] for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
@ -318,71 +323,87 @@ class OneVsAll(AggregativeQuantifier):
quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1. quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
""" """
def __init__(self, binary_method, n_jobs=-1): def __init__(self, binary_quantifier, n_jobs=-1):
self.binary_method = binary_method self.binary_quantifier = binary_quantifier
self.n_jobs = n_jobs self.n_jobs = n_jobs
def fit(self, data: LabelledCollection, **kwargs): def fit(self, data: LabelledCollection, **kwargs):
assert not data.binary, f'{self.__class__.__name__} expect non-binary data' assert not data.binary, \
assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier' f'{self.__class__.__name__} expect non-binary data'
self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_} assert isinstance(self.binary_quantifier, BaseQuantifier), \
Parallel(n_jobs=self.n_jobs, backend='threading')( f'{self.binary_quantifier} does not seem to be a Quantifier'
delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_ self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
) self.__parallel(self._delayed_binary_fit, data, **kwargs)
return self return self
def classify(self, instances):
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
return classif_predictions_bin.T
def aggregate(self, classif_predictions_bin, *args):
assert set(np.unique(classif_predictions_bin)) == {0,1}, \
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
'predictions for each document (row) and class (columns)'
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
return F.normalize_prevalence(prevalences)
def quantify(self, X, *args): def quantify(self, X, *args):
prevalences = np.asarray( prevalences = self.__parallel(self._delayed_binary_quantify, X)
return F.normalize_prevalence(prevalences)
def __parallel(self, func, *args, **kwargs):
return np.asarray(
Parallel(n_jobs=self.n_jobs, backend='threading')( Parallel(n_jobs=self.n_jobs, backend='threading')(
delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes delayed(func)(c, *args, **kwargs) for c in self.classes
) )
) )
<<<<<<< HEAD
=======
print('one vs all: ', prevalences)
>>>>>>> 2361186a01c53e744f4291e2e2299700216ff139
return F.normalize_prevalence(prevalences)
@property @property
def classes(self): def classes(self):
return sorted(self.class_method.keys()) return sorted(self.dict_binary_quantifiers.keys())
def set_params(self, **parameters): def set_params(self, **parameters):
self.binary_method.set_params(**parameters) self.binary_quantifier.set_params(**parameters)
def get_params(self, deep=True): def get_params(self, deep=True):
return self.binary_method.get_params() return self.binary_quantifier.get_params()
def _delayed_binary_predict(self, c, learners, X): def _delayed_binary_classification(self, c, X):
return learners[c].classify(X).mean() # the mean is the estimation for the positive class prevalence return self.dict_binary_quantifiers[c].classify(X)
def _delayed_binary_fit(self, c, learners, data, **kwargs): def _delayed_binary_quantify(self, c, X):
return self.dict_binary_quantifiers[c].quantify(X)[1] # the estimation for the positive class prevalence
def _delayed_binary_aggregate(self, c, classif_predictions):
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1] # the estimation for the positive class prevalence
def _delayed_binary_fit(self, c, data, **kwargs):
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2) bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
learners[c].fit(bindata, **kwargs) self.dict_binary_quantifiers[c].fit(bindata, **kwargs)
class ExplicitLossMinimisation(AggregativeQuantifier): # class ExplicitLossMinimisation(AggregativeQuantifier):
""" # """
A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary # A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
quantifier for each class and then l1-normalizes the class predictions so that they sum up to one. # quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis. # This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining6(19), 122 (2016) # Social Network Analysis and Mining6(19), 122 (2016)
""" # """
#
def __init__(self, svmperf_base, loss, **kwargs): # def __init__(self, svmperf_base, loss, **kwargs):
self.svmperf_base = svmperf_base # self.svmperf_base = svmperf_base
self.loss = loss # self.loss = loss
self.kwargs = kwargs # self.kwargs = kwargs
#
def fit(self, data: LabelledCollection, fit_learner=True, *args): # def fit(self, data: LabelledCollection, fit_learner=True, *args):
assert fit_learner, 'the method requires that fit_learner=True' # assert fit_learner, 'the method requires that fit_learner=True'
self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs) # self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
if not data.binary: # if not data.binary:
self.learner = OneVsAll(self.learner, n_jobs=-1) # self.learner = OneVsAll(self.learner, n_jobs=-1)
return self.learner.fit(data, *args) # return self.learner.fit(data, *args)
#
def quantify(self, instances, *args): # def aggregate(self, instances, *args):
return self.learner.quantify(instances, *args) # return self.learner.aggregate(instances, *args)
class ExplicitLossMinimisationBinary(AggregativeQuantifier): class ExplicitLossMinimisationBinary(AggregativeQuantifier):
@ -398,38 +419,35 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier):
self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels) self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
return self return self
def quantify(self, X, y=None): def aggregate(self, classif_predictions:np.ndarray, *args):
predictions = self.learner.predict(X) return F.prevalence_from_labels(classif_predictions, self.learner.n_classes_)
prev = F.prevalence_from_labels(predictions, self.learner.n_classes_)
print('binary: ', prev)
return prev
def classify(self, X, y=None): def classify(self, X, y=None):
return self.learner.predict(X) return self.learner.predict(X)
class SVMQ(ExplicitLossMinimisation): class SVMQ(ExplicitLossMinimisationBinary):
def __init__(self, svmperf_base, **kwargs): def __init__(self, svmperf_base, **kwargs):
super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs) super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
class SVMKLD(ExplicitLossMinimisation): class SVMKLD(ExplicitLossMinimisationBinary):
def __init__(self, svmperf_base, **kwargs): def __init__(self, svmperf_base, **kwargs):
super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs) super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)
class SVMNKLD(ExplicitLossMinimisation): class SVMNKLD(ExplicitLossMinimisationBinary):
def __init__(self, svmperf_base, **kwargs): def __init__(self, svmperf_base, **kwargs):
super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs) super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)
class SVMAE(ExplicitLossMinimisation): class SVMAE(ExplicitLossMinimisationBinary):
def __init__(self, svmperf_base, **kwargs): def __init__(self, svmperf_base, **kwargs):
super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs) super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs)
class SVMRAE(ExplicitLossMinimisation): class SVMRAE(ExplicitLossMinimisationBinary):
def __init__(self, svmperf_base, **kwargs): def __init__(self, svmperf_base, **kwargs):
super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs) super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
@ -438,7 +456,7 @@ CC = ClassifyAndCount
ACC = AdjustedClassifyAndCount ACC = AdjustedClassifyAndCount
PCC = ProbabilisticClassifyAndCount PCC = ProbabilisticClassifyAndCount
PACC = ProbabilisticAdjustedClassifyAndCount PACC = ProbabilisticAdjustedClassifyAndCount
ELM = ExplicitLossMinimisation ELM = ExplicitLossMinimisationBinary
EMQ = ExpectationMaximizationQuantifier EMQ = ExpectationMaximizationQuantifier
HDy = HellingerDistanceY HDy = HellingerDistanceY

View File

@ -18,3 +18,48 @@ class BaseQuantifier(metaclass=ABCMeta):
def get_params(self, deep=True): ... def get_params(self, deep=True): ...
# class OneVsAll(AggregativeQuantifier):
# """
# Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
# quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
# """
#
# def __init__(self, binary_method, n_jobs=-1):
# self.binary_method = binary_method
# self.n_jobs = n_jobs
#
# def fit(self, data: LabelledCollection, **kwargs):
# assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
# assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
# self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
# Parallel(n_jobs=self.n_jobs, backend='threading')(
# delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
# )
# return self
#
# def quantify(self, X, *args):
# prevalences = np.asarray(
# Parallel(n_jobs=self.n_jobs, backend='threading')(
# delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
# )
# )
# return F.normalize_prevalence(prevalences)
#
# @property
# def classes(self):
# return sorted(self.class_method.keys())
#
# def set_params(self, **parameters):
# self.binary_method.set_params(**parameters)
#
# def get_params(self, deep=True):
# return self.binary_method.get_params()
#
# def _delayed_binary_predict(self, c, learners, X):
# return learners[c].quantify(X)[:,1] # the mean is the estimation for the positive class prevalence
#
# def _delayed_binary_fit(self, c, learners, data, **kwargs):
# bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
# learners[c].fit(bindata, **kwargs)

View File

@ -3,6 +3,10 @@ import multiprocessing
from joblib import Parallel, delayed from joblib import Parallel, delayed
import contextlib import contextlib
import numpy as np import numpy as np
import urllib
import os
from pathlib import Path
@ -33,3 +37,27 @@ def temp_seed(seed):
finally: finally:
np.random.set_state(state) np.random.set_state(state)
def download_file(url, archive_filename):
    """Download the resource at ``url`` into the local file ``archive_filename``.

    A single progress line (megabytes transferred / total megabytes) is
    rewritten on stdout while the transfer is running.

    :param url: address of the resource to retrieve
    :param archive_filename: path of the local file the content is written to
    """
    # ``import urllib`` alone does not load the ``request`` submodule; import it
    # explicitly so the call below does not rely on some other module having
    # imported it beforehand.
    import urllib.request

    def progress(blocknum, bs, size):
        # reporthook arguments: blocks transferred so far, block size in bytes,
        # total size in bytes (-1 when the total size is not known).
        transferred = blocknum * bs
        if size >= 0:
            # the last block is usually partial, so do not report more than the total
            transferred = min(transferred, size)
            total_sz_mb = '%.2f MB' % (size / 1e6)
        else:
            total_sz_mb = 'unknown size'
        current_sz_mb = '%.2f MB' % (transferred / 1e6)
        # '\r' + end='' keeps rewriting the same console line
        print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')

    print("Downloading %s" % url)
    urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
    # terminate the progress line
    print("")
def download_file_if_not_exists(url, archive_path):
    """Download ``url`` to ``archive_path`` unless that path already exists.

    When the path is missing, its parent directory is created first and the
    resource is then retrieved with ``download_file``.

    :param url: address of the resource to retrieve
    :param archive_path: local path where the resource is expected / stored
    """
    if os.path.exists(archive_path):
        return
    parent = os.path.dirname(archive_path)
    # A bare file name has an empty dirname and os.makedirs('') raises
    # FileNotFoundError; there is no directory to create in that case.
    if parent:
        create_if_not_exist(parent)
    download_file(url, archive_path)
def create_if_not_exist(path):
    """Make sure the directory ``path`` exists, creating missing parents too.

    Calling it on a directory that is already present is a no-op.

    :param path: directory to create
    """
    os.makedirs(name=path, exist_ok=True)
def get_quapy_home():
    """Return the path of the ``quapy_data`` folder inside the user's home directory.

    The path is only computed; nothing is created on disk.

    :return: the path as a string
    """
    home_dir = Path.home()
    return str(home_dir / 'quapy_data')

44
test.py
View File

@ -2,37 +2,45 @@ from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC from sklearn.svm import LinearSVC
import quapy as qp import quapy as qp
import quapy.functional as F import quapy.functional as F
import sys
#qp.datasets.fetch_reviews('hp')
#qp.datasets.fetch_twitter('sst')
#sys.exit()
SAMPLE_SIZE=500 SAMPLE_SIZE=500
binary = False binary = False
svmperf_home = './svm_perf_quantification'
if binary: if binary:
# load a textual binary dataset and create a tfidf bag of words dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
train_path = './datasets/reviews/kindle/train.txt'
test_path = './datasets/reviews/kindle/test.txt'
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
qp.preprocessing.text2tfidf(dataset, inplace=True)
qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
else: else:
# load a sparse matrix ternary dataset dataset = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10)
train_path = './datasets/twitter/train/sst.train+dev.feature.txt' dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
test_path = './datasets/twitter/test/sst.test.feature.txt'
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse) print('dataset loaded')
# training a quantifier # training a quantifier
learner = LogisticRegression() learner = LogisticRegression()
model = qp.method.aggregative.ClassifyAndCount(learner) # model = qp.method.aggregative.ClassifyAndCount(learner)
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner) # model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner) # model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner) # model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner) # model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
if not binary:
model = qp.method.aggregative.OneVsAll(model)
print('fitting model')
model.fit(dataset.training) model.fit(dataset.training)
# estimating class prevalences # estimating class prevalences
print('quantifying')
prevalences_estim = model.quantify(dataset.test.instances) prevalences_estim = model.quantify(dataset.test.instances)
prevalences_true = dataset.test.prevalence() prevalences_true = dataset.test.prevalence()
@ -46,9 +54,17 @@ print(f'true prevalence {F.strprev(prevalences_true)}')
print(f'estim prevalence {F.strprev(prevalences_estim)}') print(f'estim prevalence {F.strprev(prevalences_estim)}')
print(f'mae={error:.3f}') print(f'mae={error:.3f}')
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE)
qp.error.SAMPLE_SIZE=SAMPLE_SIZE max_evaluations = 5000
n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that\n'
f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n'
f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
qp.error.SAMPLE_SIZE = SAMPLE_SIZE
print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)') print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
for error in qp.error.QUANTIFICATION_ERROR: for error in qp.error.QUANTIFICATION_ERROR:
score = error(true_prev, estim_prev) score = error(true_prev, estim_prev)