This commit is contained in:
Alejandro Moreo Fernandez 2020-12-15 13:39:43 +01:00
commit d6edfe983e
11 changed files with 345 additions and 138 deletions

View File

@ -1,11 +1,8 @@
Documentation with sphinx
Add evaluation - artificial sampling
Add quantification_report (akin to classification_report from sklearn)
Add optimization - artificial sampling
Add prediction - artificial sampling
Add readers for typical datasets used in Quantification
Add NAE, NRAE
Add "measures for evaluating ordinal"?
Document methods with paper references
The parallel training in svmperf seems not to work
The parallel training in svmperf seems not to work (not sure...)

View File

@ -1,5 +1,6 @@
from .base import *
from .reader import *
from . import preprocessing
from . import datasets

83
quapy/data/datasets.py Normal file
View File

@ -0,0 +1,83 @@
import zipfile
from utils.util import download_file_if_not_exists, download_file, get_quapy_home
import os
from os.path import join
from data.base import Dataset, LabelledCollection
from data.reader import from_text, from_sparse
from data.preprocessing import text2tfidf, reduce_columns
# Dataset names accepted by fetch_reviews; each name is interpolated into the download URLs and the local path.
REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
# Dataset names accepted by fetch_twitter; each name selects the feature files read from the unzipped archive.
TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'semeval14', 'semeval15', 'semeval16',
'sst', 'wa', 'wb']
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
    """
    Loads one of the sentiment reviews datasets, downloading its train and test files if they are not
    already stored locally.

    :param dataset_name: the name of the dataset; must be one of REVIEWS_SENTIMENT_DATASETS
    :param tfidf: if True, text2tfidf is applied in place to the loaded dataset
    :param min_df: if not None, reduce_columns is applied in place with this value of min_df
    :param data_home: the directory under which the files are stored (in <data_home>/reviews/<dataset_name>);
        if None, the directory returned by get_quapy_home() is used
    :return: the Dataset loaded from the train and test files
    """
    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'

    root = data_home if data_home is not None else get_quapy_home()
    os.makedirs(join(root, 'reviews'), exist_ok=True)

    train_path = join(root, 'reviews', dataset_name, 'train.txt')
    test_path = join(root, 'reviews', dataset_name, 'test.txt')

    # (remote url, local path) pairs, fetched in this order and only when the local file is missing
    remote_files = (
        (f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt', train_path),
        (f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt', test_path),
    )
    for url, local_path in remote_files:
        download_file_if_not_exists(url, local_path)

    reviews = Dataset.load(train_path, test_path, from_text)

    if tfidf:
        text2tfidf(reviews, inplace=True)

    if min_df is not None:
        reduce_columns(reviews, min_df=min_df, inplace=True)

    return reviews
def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
    """
    Loads one of the twitter sentiment datasets, downloading and unzipping the archive that contains all of
    them if it has not been unzipped yet.

    :param dataset_name: the name of the dataset; must be one of TWITTER_SENTIMENT_DATASETS
    :param model_selection: if True, the "train" file is used for training and the "dev" file for test;
        if False, the "train+dev" file is used for training and the "test" file ("dev-test" for
        'semeval16') is used for test
    :param min_df: if not None, reduce_columns is applied in place with this value of min_df
    :param data_home: the directory in which the archive is downloaded and unzipped; if None, the
        directory returned by get_quapy_home() is used
    :return: the Dataset loaded from the selected train and test files
    """
    assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
        f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
    if not os.path.exists(unzipped_path):
        # the archive is written directly into data_home, which may not exist yet on a first run
        os.makedirs(data_home, exist_ok=True)
        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
        download_file(URL, downloaded_path)
        try:
            with zipfile.ZipFile(downloaded_path) as file:
                file.extractall(data_home)
        finally:
            # the archive is removed even if the extraction fails, so that it is not left behind
            os.remove(downloaded_path)

    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
        trainset_name = 'semeval'
        testset_name = 'semeval' if model_selection else dataset_name
        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
              f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
    else:
        trainset_name = testset_name = dataset_name

    if model_selection:
        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
        test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
    else:
        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
        if dataset_name == 'semeval16':
            # for semeval16 the test file is named "dev-test" instead of "test"
            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
        else:
            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')

    data = Dataset.load(train, test, from_sparse)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    return data

View File

@ -54,3 +54,4 @@ def from_sparse(path):
X = X.tocsr()
y = np.asarray(all_labels) + 1
return X, y

View File

@ -1,4 +1,5 @@
from data import LabelledCollection
from quapy.method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier
from method.base import BaseQuantifier
from utils.util import temp_seed
import numpy as np
@ -10,8 +11,8 @@ def artificial_sampling_prediction(
model: BaseQuantifier,
test: LabelledCollection,
sample_size,
prevalence_points=21,
point_repetitions=1,
n_prevpoints=210,
n_repetitions=1,
n_jobs=-1,
random_seed=42):
"""
@ -19,27 +20,40 @@ def artificial_sampling_prediction(
:param model: the model in charge of generating the class prevalence estimations
:param test: the test set on which to perform artificial sampling
:param sample_size: the size of the samples
:param prevalence_points: the number of different prevalences to sample
:param point_repetitions: the number of repetitions for each prevalence
:param n_prevpoints: the number of different prevalences to sample
:param n_repetitions: the number of repetitions for each prevalence
:param n_jobs: number of jobs to be run in parallel
:param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
any other random process.
:return: two ndarrays of [m,n] with m the number of samples (prevalence_points*point_repetitions) and n the
:return: two ndarrays of [m,n] with m the number of samples (n_prevpoints*n_repetitions) and n the
number of classes. The first one contains the true prevalences for the samples generated while the second one
contains the prevalence estimations
"""
with temp_seed(random_seed):
indexes = list(test.artificial_sampling_index_generator(sample_size, prevalence_points, point_repetitions))
indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions))
if isinstance(model, AggregativeQuantifier):
quantification_func = model.aggregate
if isinstance(model, AggregativeProbabilisticQuantifier):
print('\tpreclassifying with soft')
preclassified_instances = model.posterior_probabilities(test.instances)
else:
print('\tpreclassifying with hard')
preclassified_instances = model.classify(test.instances)
test = LabelledCollection(preclassified_instances, test.labels)
else:
quantification_func = model.quantify
print('not an aggregative')
def _predict_prevalences(index):
sample = test.sampling_from_index(index)
true_prevalence = sample.prevalence()
estim_prevalence = model.quantify(sample.instances)
estim_prevalence = quantification_func(sample.instances)
return true_prevalence, estim_prevalence
results = Parallel(n_jobs=n_jobs)(
delayed(_predict_prevalences)(index) for index in tqdm(indexes)
delayed(_predict_prevalences)(index) for index in tqdm(indexes, desc='[artificial sampling protocol] predicting')
)
true_prevalences, estim_prevalences = zip(*results)

View File

@ -36,6 +36,8 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
def prevalence_from_labels(labels, n_classes):
if labels.ndim != 1:
raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
unique, counts = np.unique(labels, return_counts=True)
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float)
@ -44,6 +46,8 @@ def prevalence_from_labels(labels, n_classes):
def prevalence_from_probabilities(posteriors, binarize: bool = False):
if posteriors.ndim != 2:
raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities')
if binarize:
predictions = np.argmax(posteriors, axis=-1)
return prevalence_from_labels(predictions, n_classes=posteriors.shape[1])
@ -78,15 +82,15 @@ def normalize_prevalence(prevalences):
def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1):
"""
Computes the number of prevalence combinations in the nclasses-dimensional simplex if nprevpoints equally distant
prevalences are generated and nrepeats repetitions are requested
:param nclasses: number of classes
:param nprevpoints: number of prevalence points.
:param nrepeats: number of repetitions for each prevalence combination
:return: The number of possible combinations. For example, if nclasses=2, nprevpoints=5, nrepeats=1, then the number
of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
Computes the number of prevalence combinations in the n_classes-dimensional simplex if nprevpoints equally distant
prevalences are generated and n_repeats repetitions are requested
:param n_classes: number of classes
:param n_prevpoints: number of prevalence points.
:param n_repeats: number of repetitions for each prevalence combination
:return: The number of possible combinations. For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the
number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
"""
__cache={}
def __f(nc,np):
@ -98,25 +102,25 @@ def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
x = sum([__f(nc-1, np-i) for i in range(np)])
__cache[(nc,np)] = x
return x
return __f(nclasses, nprevpoints) * nrepeats
return __f(n_classes, n_prevpoints) * n_repeats
def get_nprevpoints_approximation(nclasses, nrepeats, combinations_budget):
def get_nprevpoints_approximation(combinations_budget:int, n_classes:int, n_repeats:int=1):
"""
Searches for the largest number of (equidistant) prevalence points to define for each of the nclasses classe so that
the number of valid prevalences generated as combinations of prevalence points (points in a nclasses-dimensional
Searches for the largest number of (equidistant) prevalence points to define for each of the n_classes classes so that
the number of valid prevalences generated as combinations of prevalence points (points in a n_classes-dimensional
simplex) do not exceed combinations_budget.
:param nclasses: number of classes
:param nrepeats: number of repetitions for each prevalence combination
:param n_classes: number of classes
:param n_repeats: number of repetitions for each prevalence combination
:param combinations_budget: maximum number of combinations allowed
:return: the largest number of prevalence points that generate less than combinations_budget valid prevalences
"""
assert nclasses>0 and nrepeats>0 and combinations_budget>0, 'parameters must be positive integers'
nprevpoints = 1
assert n_classes > 0 and n_repeats > 0 and combinations_budget > 0, 'parameters must be positive integers'
n_prevpoints = 1
while True:
combinations = num_prevalence_combinations(nclasses, nprevpoints, nrepeats)
combinations = num_prevalence_combinations(n_prevpoints, n_classes, n_repeats)
if combinations > combinations_budget:
return nprevpoints-1
return n_prevpoints-1
else:
nprevpoints+=1
n_prevpoints += 1

View File

@ -8,7 +8,7 @@ AGGREGATIVE_METHODS = {
agg.AdjustedClassifyAndCount,
agg.ProbabilisticClassifyAndCount,
agg.ProbabilisticAdjustedClassifyAndCount,
agg.ExplicitLossMinimisation,
agg.ExplicitLossMinimisationBinary,
agg.ExpectationMaximizationQuantifier,
agg.HellingerDistanceY
}

View File

@ -34,6 +34,13 @@ class AggregativeQuantifier(BaseQuantifier):
def classify(self, instances):
return self.learner.predict(instances)
def quantify(self, instances, *args):
classif_predictions = self.classify(instances)
return self.aggregate(classif_predictions, *args)
@abstractmethod
def aggregate(self, classif_predictions:np.ndarray, *args): ...
def get_params(self, deep=True):
return self.learner.get_params()
@ -53,13 +60,17 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
"""
Abstract class for quantification methods that base their estimations on the aggregation of posterior probabilities
as returned by a probabilistic classifier. Aggregative Probabilistic Quantifiers thus extend Aggregative
Quantifiersimplement by implementing a _soft_classify_ method returning values in [0,1] -- the posterior
Quantifiers by implementing a _posterior_probabilities_ method returning values in [0,1] -- the posterior
probabilities.
"""
def soft_classify(self, data):
def posterior_probabilities(self, data):
return self.learner.predict_proba(data)
def quantify(self, instances, *args):
classif_posteriors = self.posterior_probabilities(instances)
return self.aggregate(classif_posteriors, *args)
def set_params(self, **parameters):
if isinstance(self.learner, CalibratedClassifierCV):
parameters={'base_estimator__'+k:v for k,v in parameters.items()}
@ -128,9 +139,8 @@ class ClassifyAndCount(AggregativeQuantifier):
self.learner, _ = training_helper(self.learner, data, fit_learner)
return self
def quantify(self, instances, *args):
classification = self.classify(instances) # classify
return F.prevalence_from_labels(classification, self.n_classes) # & count
def aggregate(self, classif_predictions, *args):
return F.prevalence_from_labels(classif_predictions, self.n_classes)
class AdjustedClassifyAndCount(AggregativeQuantifier):
@ -141,17 +151,24 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
self.learner, validation = training_helper(self.learner, data, fit_learner, train_val_split=train_val_split)
self.cc = ClassifyAndCount(self.learner)
y_ = self.cc.classify(validation.instances)
y_ = self.classify(validation.instances)
y = validation.labels
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
return self
def quantify(self, instances, *args):
prevs_estim = self.cc.quantify(instances)
# solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim
A = self.Pte_cond_estim_
def classify(self, data):
return self.cc.classify(data)
def aggregate(self, classif_predictions, *args):
prevs_estim = self.cc.aggregate(classif_predictions)
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
@classmethod
def solve_adjustment(cls, PteCondEstim, prevs_estim):
# solve for the linear system Ax = B with A=PteCondEstim and B = prevs_estim
A = PteCondEstim
B = prevs_estim
try:
adjusted_prevs = np.linalg.solve(A, B)
@ -161,9 +178,6 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
adjusted_prevs = prevs_estim # no way to adjust them!
return adjusted_prevs
def classify(self, data):
return self.cc.classify(data)
class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
def __init__(self, learner):
@ -173,13 +187,11 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
return self
def quantify(self, instances, *args):
posteriors = self.soft_classify(instances) # classify
prevalences = F.prevalence_from_probabilities(posteriors, binarize=False) # & count
return prevalences
def aggregate(self, classif_posteriors, *args):
return F.prevalence_from_probabilities(classif_posteriors, binarize=False)
class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
def __init__(self, learner):
self.learner = learner
@ -189,28 +201,23 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split
)
self.pcc = ProbabilisticClassifyAndCount(self.learner)
y_ = self.pcc.classify(validation.instances)
y_ = self.classify(validation.instances)
y = validation.labels
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
return self
def quantify(self, instances, *args):
prevs_estim = self.pcc.quantify(instances)
A = self.Pte_cond_estim_
B = prevs_estim
try:
adjusted_prevs = np.linalg.solve(A, B)
adjusted_prevs = np.clip(adjusted_prevs, 0, 1)
adjusted_prevs /= adjusted_prevs.sum()
except np.linalg.LinAlgError:
adjusted_prevs = prevs_estim # no way to adjust them!
return adjusted_prevs
def aggregate(self, classif_posteriors, *args):
prevs_estim = self.pcc.aggregate(classif_posteriors)
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
def classify(self, data):
return self.pcc.classify(data)
def soft_classify(self, data):
return self.pcc.posterior_probabilities(data)
class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
@ -226,10 +233,8 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes)
return self
def quantify(self, X, epsilon=EPSILON):
tr_prev=self.train_prevalence
posteriors = self.soft_classify(X)
return self.EM(tr_prev, posteriors, self.verbose, epsilon)
def aggregate(self, classif_posteriors, epsilon=EPSILON):
return self.EM(self.train_prevalence, classif_posteriors, self.verbose, epsilon)
@classmethod
def EM(cls, tr_prev, posterior_probabilities, verbose=False, epsilon=EPSILON):
@ -277,17 +282,17 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
self.learner, validation = training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
Px = self.soft_classify(validation.instances)
Px = self.posterior_probabilities(validation.instances)
self.Pxy1 = Px[validation.labels == 1]
self.Pxy0 = Px[validation.labels == 0]
return self
def quantify(self, instances, *args):
def aggregate(self, classif_posteriors, *args):
# "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
# and the final estimated a priori probability was taken as the median of these 11 estimates."
# (González-Castro, et al., 2013).
Px = self.soft_classify(instances)
Px = classif_posteriors
prev_estimations = []
for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
@ -318,71 +323,87 @@ class OneVsAll(AggregativeQuantifier):
quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
"""
def __init__(self, binary_method, n_jobs=-1):
self.binary_method = binary_method
def __init__(self, binary_quantifier, n_jobs=-1):
self.binary_quantifier = binary_quantifier
self.n_jobs = n_jobs
def fit(self, data: LabelledCollection, **kwargs):
assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
Parallel(n_jobs=self.n_jobs, backend='threading')(
delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
)
assert not data.binary, \
f'{self.__class__.__name__} expect non-binary data'
assert isinstance(self.binary_quantifier, BaseQuantifier), \
f'{self.binary_quantifier} does not seem to be a Quantifier'
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
self.__parallel(self._delayed_binary_fit, data, **kwargs)
return self
def classify(self, instances):
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
return classif_predictions_bin.T
def aggregate(self, classif_predictions_bin, *args):
assert set(np.unique(classif_predictions_bin)) == {0,1}, \
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
'predictions for each document (row) and class (columns)'
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
return F.normalize_prevalence(prevalences)
def quantify(self, X, *args):
prevalences = np.asarray(
prevalences = self.__parallel(self._delayed_binary_quantify, X)
return F.normalize_prevalence(prevalences)
def __parallel(self, func, *args, **kwargs):
return np.asarray(
Parallel(n_jobs=self.n_jobs, backend='threading')(
delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
delayed(func)(c, *args, **kwargs) for c in self.classes
)
)
<<<<<<< HEAD
=======
print('one vs all: ', prevalences)
>>>>>>> 2361186a01c53e744f4291e2e2299700216ff139
return F.normalize_prevalence(prevalences)
@property
def classes(self):
return sorted(self.class_method.keys())
return sorted(self.dict_binary_quantifiers.keys())
def set_params(self, **parameters):
self.binary_method.set_params(**parameters)
self.binary_quantifier.set_params(**parameters)
def get_params(self, deep=True):
return self.binary_method.get_params()
return self.binary_quantifier.get_params()
def _delayed_binary_predict(self, c, learners, X):
return learners[c].classify(X).mean() # the mean is the estimation for the positive class prevalence
def _delayed_binary_classification(self, c, X):
return self.dict_binary_quantifiers[c].classify(X)
def _delayed_binary_fit(self, c, learners, data, **kwargs):
def _delayed_binary_quantify(self, c, X):
return self.dict_binary_quantifiers[c].quantify(X)[1] # the estimation for the positive class prevalence
def _delayed_binary_aggregate(self, c, classif_predictions):
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1] # the estimation for the positive class prevalence
def _delayed_binary_fit(self, c, data, **kwargs):
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
learners[c].fit(bindata, **kwargs)
self.dict_binary_quantifiers[c].fit(bindata, **kwargs)
class ExplicitLossMinimisation(AggregativeQuantifier):
"""
A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining6(19), 122 (2016)
"""
def __init__(self, svmperf_base, loss, **kwargs):
self.svmperf_base = svmperf_base
self.loss = loss
self.kwargs = kwargs
def fit(self, data: LabelledCollection, fit_learner=True, *args):
assert fit_learner, 'the method requires that fit_learner=True'
self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
if not data.binary:
self.learner = OneVsAll(self.learner, n_jobs=-1)
return self.learner.fit(data, *args)
def quantify(self, instances, *args):
return self.learner.quantify(instances, *args)
# class ExplicitLossMinimisation(AggregativeQuantifier):
# """
# A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
# quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
# This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
# Social Network Analysis and Mining6(19), 122 (2016)
# """
#
# def __init__(self, svmperf_base, loss, **kwargs):
# self.svmperf_base = svmperf_base
# self.loss = loss
# self.kwargs = kwargs
#
# def fit(self, data: LabelledCollection, fit_learner=True, *args):
# assert fit_learner, 'the method requires that fit_learner=True'
# self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
# if not data.binary:
# self.learner = OneVsAll(self.learner, n_jobs=-1)
# return self.learner.fit(data, *args)
#
# def aggregate(self, instances, *args):
# return self.learner.aggregate(instances, *args)
class ExplicitLossMinimisationBinary(AggregativeQuantifier):
@ -398,38 +419,35 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier):
self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
return self
def quantify(self, X, y=None):
predictions = self.learner.predict(X)
prev = F.prevalence_from_labels(predictions, self.learner.n_classes_)
print('binary: ', prev)
return prev
def aggregate(self, classif_predictions:np.ndarray, *args):
return F.prevalence_from_labels(classif_predictions, self.learner.n_classes_)
def classify(self, X, y=None):
return self.learner.predict(X)
class SVMQ(ExplicitLossMinimisation):
class SVMQ(ExplicitLossMinimisationBinary):
def __init__(self, svmperf_base, **kwargs):
super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
class SVMKLD(ExplicitLossMinimisation):
class SVMKLD(ExplicitLossMinimisationBinary):
def __init__(self, svmperf_base, **kwargs):
super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)
class SVMNKLD(ExplicitLossMinimisation):
class SVMNKLD(ExplicitLossMinimisationBinary):
def __init__(self, svmperf_base, **kwargs):
super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)
class SVMAE(ExplicitLossMinimisation):
class SVMAE(ExplicitLossMinimisationBinary):
def __init__(self, svmperf_base, **kwargs):
super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs)
class SVMRAE(ExplicitLossMinimisation):
class SVMRAE(ExplicitLossMinimisationBinary):
def __init__(self, svmperf_base, **kwargs):
super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
@ -438,7 +456,7 @@ CC = ClassifyAndCount
ACC = AdjustedClassifyAndCount
PCC = ProbabilisticClassifyAndCount
PACC = ProbabilisticAdjustedClassifyAndCount
ELM = ExplicitLossMinimisation
ELM = ExplicitLossMinimisationBinary
EMQ = ExpectationMaximizationQuantifier
HDy = HellingerDistanceY

View File

@ -18,3 +18,48 @@ class BaseQuantifier(metaclass=ABCMeta):
def get_params(self, deep=True): ...
# class OneVsAll(AggregativeQuantifier):
# """
# Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
# quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1.
# """
#
# def __init__(self, binary_method, n_jobs=-1):
# self.binary_method = binary_method
# self.n_jobs = n_jobs
#
# def fit(self, data: LabelledCollection, **kwargs):
# assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
# assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
# self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
# Parallel(n_jobs=self.n_jobs, backend='threading')(
# delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
# )
# return self
#
# def quantify(self, X, *args):
# prevalences = np.asarray(
# Parallel(n_jobs=self.n_jobs, backend='threading')(
# delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
# )
# )
# return F.normalize_prevalence(prevalences)
#
# @property
# def classes(self):
# return sorted(self.class_method.keys())
#
# def set_params(self, **parameters):
# self.binary_method.set_params(**parameters)
#
# def get_params(self, deep=True):
# return self.binary_method.get_params()
#
# def _delayed_binary_predict(self, c, learners, X):
# return learners[c].quantify(X)[:,1] # the mean is the estimation for the positive class prevalence
#
# def _delayed_binary_fit(self, c, learners, data, **kwargs):
# bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
# learners[c].fit(bindata, **kwargs)

View File

@ -3,6 +3,10 @@ import multiprocessing
from joblib import Parallel, delayed
import contextlib
import numpy as np
import urllib
import os
from pathlib import Path
@ -33,3 +37,27 @@ def temp_seed(seed):
finally:
np.random.set_state(state)
def download_file(url, archive_filename):
    """
    Downloads the resource located at url into the file archive_filename, printing the progress of the
    download to the standard output.

    :param url: the URL of the resource to download
    :param archive_filename: the path of the local file in which the resource is stored
    """
    # a bare "import urllib" does not load the "request" submodule, so it is imported explicitly here
    import urllib.request

    def progress(blocknum, bs, size):
        downloaded = blocknum * bs
        if size > 0:
            # the last block is typically partial, so blocknum * bs may overshoot the total size
            downloaded = min(downloaded, size)
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % (downloaded / 1e6)
            print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
        else:
            # the total size is unknown (reported as a non-positive value): show only the amount downloaded
            print('\rdownloaded %.2f MB' % (downloaded / 1e6), end='')

    print("Downloading %s" % url)
    urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
    print("")
def download_file_if_not_exists(url, archive_path):
    """
    Downloads the resource located at url into archive_path, unless archive_path already exists, in which
    case nothing is done. The parent directory of archive_path is created if needed.

    :param url: the URL of the resource to download
    :param archive_path: the path of the local file in which the resource is stored
    """
    if os.path.exists(archive_path):
        return
    parent_dir = os.path.dirname(archive_path)
    # a bare file name has an empty dirname, and os.makedirs('') raises FileNotFoundError
    if parent_dir:
        create_if_not_exist(parent_dir)
    download_file(url, archive_path)
def create_if_not_exist(path):
    """
    Creates the directory path, together with any missing parent directory, unless it already exists.

    :param path: the path of the directory to create
    """
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)
def get_quapy_home():
    """
    Returns the path of the 'quapy_data' directory located in the home directory of the current user.

    :return: the path, as a str
    """
    home_dir = Path.home()
    return str(home_dir / 'quapy_data')

44
test.py
View File

@ -2,37 +2,45 @@ from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
import sys
#qp.datasets.fetch_reviews('hp')
#qp.datasets.fetch_twitter('sst')
#sys.exit()
SAMPLE_SIZE=500
binary = False
svmperf_home = './svm_perf_quantification'
if binary:
# load a textual binary dataset and create a tfidf bag of words
train_path = './datasets/reviews/kindle/train.txt'
test_path = './datasets/reviews/kindle/test.txt'
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
qp.preprocessing.text2tfidf(dataset, inplace=True)
qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
else:
# load a sparse matrix ternary dataset
train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
test_path = './datasets/twitter/test/sst.test.feature.txt'
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
dataset = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10)
dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
print('dataset loaded')
# training a quantifier
learner = LogisticRegression()
model = qp.method.aggregative.ClassifyAndCount(learner)
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ClassifyAndCount(learner)
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
if not binary:
model = qp.method.aggregative.OneVsAll(model)
print('fitting model')
model.fit(dataset.training)
# estimating class prevalences
print('quantifying')
prevalences_estim = model.quantify(dataset.test.instances)
prevalences_true = dataset.test.prevalence()
@ -46,9 +54,17 @@ print(f'true prevalence {F.strprev(prevalences_true)}')
print(f'estim prevalence {F.strprev(prevalences_estim)}')
print(f'mae={error:.3f}')
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE)
qp.error.SAMPLE_SIZE=SAMPLE_SIZE
max_evaluations = 5000
n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that\n'
f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n'
f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
qp.error.SAMPLE_SIZE = SAMPLE_SIZE
print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
for error in qp.error.QUANTIFICATION_ERROR:
score = error(true_prev, estim_prev)