forked from moreo/QuaPy
Merge branch 'master' of https://gitea-s2i2s.isti.cnr.it/moreo/QuaPy
This commit is contained in:
commit
d6edfe983e
5
TODO.txt
5
TODO.txt
|
@ -1,11 +1,8 @@
|
|||
Documentation with sphinx
|
||||
Add evaluation - artificial sampling
|
||||
Add quantification_report (akin to classification_report from sklearn)
|
||||
Add optimization - artificial sampling
|
||||
Add prediction - artificial sampling
|
||||
Add readers for typical datasets used in Quantification
|
||||
Add NAE, NRAE
|
||||
Add "measures for evaluating ordinal"?
|
||||
Document methods with paper references
|
||||
The parallel training in svmperf seems not to work
|
||||
The parallel training in svmperf seems not to work (not sure...)
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from .base import *
|
||||
from .reader import *
|
||||
from . import preprocessing
|
||||
from . import datasets
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,83 @@
|
|||
import zipfile
|
||||
from utils.util import download_file_if_not_exists, download_file, get_quapy_home
|
||||
import os
|
||||
from os.path import join
|
||||
from data.base import Dataset, LabelledCollection
|
||||
from data.reader import from_text, from_sparse
|
||||
from data.preprocessing import text2tfidf, reduce_columns
|
||||
|
||||
|
||||
REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
|
||||
TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'semeval14', 'semeval15', 'semeval16',
|
||||
'sst', 'wa', 'wb']
|
||||
|
||||
|
||||
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
    """
    Fetch one of the sentiment-review datasets listed in REVIEWS_SENTIMENT_DATASETS and load it as a Dataset.

    The train and test text files are downloaded from Zenodo (only if they are not already on disk) into
    <data_home>/reviews/<dataset_name>/ and then read with the `from_text` reader.

    :param dataset_name: name of the dataset; must be one of REVIEWS_SENTIMENT_DATASETS
    :param tfidf: if True, `text2tfidf` is applied in-place to the loaded dataset
    :param min_df: if not None, `reduce_columns` is applied in-place with this value as its `min_df` argument
    :param data_home: directory where the files are stored; defaults to the value returned by `get_quapy_home()`
    :return: the loaded Dataset
    """
    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'

    home = get_quapy_home() if data_home is None else data_home

    URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
    URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'

    reviews_dir = join(home, 'reviews')
    os.makedirs(reviews_dir, exist_ok=True)
    dataset_dir = join(reviews_dir, dataset_name)
    train_path = join(dataset_dir, 'train.txt')
    test_path = join(dataset_dir, 'test.txt')

    # fetch each split only when it is missing from disk
    for url, path in ((URL_TRAIN, train_path), (URL_TEST, test_path)):
        download_file_if_not_exists(url, path)

    data = Dataset.load(train_path, test_path, from_text)

    # optional in-place post-processing steps, applied in this order
    if tfidf:
        text2tfidf(data, inplace=True)
    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    return data
|
||||
|
||||
|
||||
def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
    """
    Fetch one of the Twitter sentiment datasets listed in TWITTER_SENTIMENT_DATASETS and load it as a Dataset.

    The whole collection is distributed as a single zip archive, which is downloaded from Zenodo and extracted into
    `data_home` the first time any of the datasets is requested (i.e., when the extracted folder is not found).
    The archive is deleted after extraction. Files are read with the `from_sparse` reader.

    :param dataset_name: name of the dataset; must be one of TWITTER_SENTIMENT_DATASETS
    :param model_selection: if True, the dataset is composed of the '.train' file (as training set) and the '.dev'
        file (as test set); if False, the training set is the '.train+dev' file and the test set is the '.test'
        file ('.dev-test' for 'semeval16')
    :param min_df: if not None, `reduce_columns` is applied in-place with this value as its `min_df` argument
    :param data_home: directory where the data is stored; defaults to the value returned by `get_quapy_home()`
    :return: the loaded Dataset
    """
    assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
        f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
    if not os.path.exists(unzipped_path):
        # download_file writes straight to the given path and does not create missing directories,
        # so make sure data_home exists before downloading (it may not on the very first use)
        os.makedirs(data_home, exist_ok=True)
        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
        download_file(URL, downloaded_path)
        with zipfile.ZipFile(downloaded_path) as file:
            file.extractall(data_home)
        os.remove(downloaded_path)

    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
        # these three datasets share the same training and development files, stored under the name 'semeval'
        trainset_name = 'semeval'
        testset_name = 'semeval' if model_selection else dataset_name
        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
              f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
    else:
        trainset_name = testset_name = dataset_name

    if model_selection:
        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
        test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
    else:
        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
        if dataset_name == 'semeval16':
            # the test file of semeval16 uses a different suffix
            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
        else:
            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')

    data = Dataset.load(train, test, from_sparse)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    return data
|
||||
|
||||
|
||||
|
|
@ -54,3 +54,4 @@ def from_sparse(path):
|
|||
X = X.tocsr()
|
||||
y = np.asarray(all_labels) + 1
|
||||
return X, y
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from data import LabelledCollection
|
||||
from quapy.method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier
|
||||
from method.base import BaseQuantifier
|
||||
from utils.util import temp_seed
|
||||
import numpy as np
|
||||
|
@ -10,8 +11,8 @@ def artificial_sampling_prediction(
|
|||
model: BaseQuantifier,
|
||||
test: LabelledCollection,
|
||||
sample_size,
|
||||
prevalence_points=21,
|
||||
point_repetitions=1,
|
||||
n_prevpoints=210,
|
||||
n_repetitions=1,
|
||||
n_jobs=-1,
|
||||
random_seed=42):
|
||||
"""
|
||||
|
@ -19,27 +20,40 @@ def artificial_sampling_prediction(
|
|||
:param model: the model in charge of generating the class prevalence estimations
|
||||
:param test: the test set on which to perform arificial sampling
|
||||
:param sample_size: the size of the samples
|
||||
:param prevalence_points: the number of different prevalences to sample
|
||||
:param point_repetitions: the number of repetitions for each prevalence
|
||||
:param n_prevpoints: the number of different prevalences to sample
|
||||
:param n_repetitions: the number of repetitions for each prevalence
|
||||
:param n_jobs: number of jobs to be run in parallel
|
||||
:param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
|
||||
any other random process.
|
||||
:return: two ndarrays of [m,n] with m the number of samples (prevalence_points*point_repetitions) and n the
|
||||
:return: two ndarrays of [m,n] with m the number of samples (n_prevpoints*n_repetitions) and n the
|
||||
number of classes. The first one contains the true prevalences for the samples generated while the second one
|
||||
containing the the prevalences estimations
|
||||
"""
|
||||
|
||||
with temp_seed(random_seed):
|
||||
indexes = list(test.artificial_sampling_index_generator(sample_size, prevalence_points, point_repetitions))
|
||||
indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions))
|
||||
|
||||
if isinstance(model, AggregativeQuantifier):
|
||||
quantification_func = model.aggregate
|
||||
if isinstance(model, AggregativeProbabilisticQuantifier):
|
||||
print('\tpreclassifying with soft')
|
||||
preclassified_instances = model.posterior_probabilities(test.instances)
|
||||
else:
|
||||
print('\tpreclassifying with hard')
|
||||
preclassified_instances = model.classify(test.instances)
|
||||
test = LabelledCollection(preclassified_instances, test.labels)
|
||||
else:
|
||||
quantification_func = model.quantify
|
||||
print('not an aggregative')
|
||||
|
||||
def _predict_prevalences(index):
|
||||
sample = test.sampling_from_index(index)
|
||||
true_prevalence = sample.prevalence()
|
||||
estim_prevalence = model.quantify(sample.instances)
|
||||
estim_prevalence = quantification_func(sample.instances)
|
||||
return true_prevalence, estim_prevalence
|
||||
|
||||
results = Parallel(n_jobs=n_jobs)(
|
||||
delayed(_predict_prevalences)(index) for index in tqdm(indexes)
|
||||
delayed(_predict_prevalences)(index) for index in tqdm(indexes, desc='[artificial sampling protocol] predicting')
|
||||
)
|
||||
|
||||
true_prevalences, estim_prevalences = zip(*results)
|
||||
|
|
|
@ -36,6 +36,8 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
|
|||
|
||||
|
||||
def prevalence_from_labels(labels, n_classes):
|
||||
if labels.ndim != 1:
|
||||
raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
|
||||
unique, counts = np.unique(labels, return_counts=True)
|
||||
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
|
||||
prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float)
|
||||
|
@ -44,6 +46,8 @@ def prevalence_from_labels(labels, n_classes):
|
|||
|
||||
|
||||
def prevalence_from_probabilities(posteriors, binarize: bool = False):
|
||||
if posteriors.ndim != 2:
|
||||
raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities')
|
||||
if binarize:
|
||||
predictions = np.argmax(posteriors, axis=-1)
|
||||
return prevalence_from_labels(predictions, n_classes=posteriors.shape[1])
|
||||
|
@ -78,15 +82,15 @@ def normalize_prevalence(prevalences):
|
|||
|
||||
|
||||
|
||||
def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
|
||||
def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1):
|
||||
"""
|
||||
Computes the number of prevalence combinations in the nclasses-dimensional simplex if nprevpoints equally distant
|
||||
prevalences are generated and nrepeats repetitions are requested
|
||||
:param nclasses: number of classes
|
||||
:param nprevpoints: number of prevalence points.
|
||||
:param nrepeats: number of repetitions for each prevalence combination
|
||||
:return: The number of possible combinations. For example, if nclasses=2, nprevpoints=5, nrepeats=1, then the number
|
||||
of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
|
||||
Computes the number of prevalence combinations in the n_classes-dimensional simplex if nprevpoints equally distant
|
||||
prevalences are generated and n_repeats repetitions are requested
|
||||
:param n_classes: number of classes
|
||||
:param n_prevpoints: number of prevalence points.
|
||||
:param n_repeats: number of repetitions for each prevalence combination
|
||||
:return: The number of possible combinations. For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the
|
||||
number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
|
||||
"""
|
||||
__cache={}
|
||||
def __f(nc,np):
|
||||
|
@ -98,25 +102,25 @@ def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
|
|||
x = sum([__f(nc-1, np-i) for i in range(np)])
|
||||
__cache[(nc,np)] = x
|
||||
return x
|
||||
return __f(nclasses, nprevpoints) * nrepeats
|
||||
return __f(n_classes, n_prevpoints) * n_repeats
|
||||
|
||||
|
||||
def get_nprevpoints_approximation(nclasses, nrepeats, combinations_budget):
|
||||
def get_nprevpoints_approximation(combinations_budget:int, n_classes:int, n_repeats:int=1):
|
||||
"""
|
||||
Searches for the largest number of (equidistant) prevalence points to define for each of the nclasses classe so that
|
||||
the number of valid prevalences generated as combinations of prevalence points (points in a nclasses-dimensional
|
||||
Searches for the largest number of (equidistant) prevalence points to define for each of the n_classes classes so that
|
||||
the number of valid prevalences generated as combinations of prevalence points (points in a n_classes-dimensional
|
||||
simplex) do not exceed combinations_budget.
|
||||
:param nclasses: number of classes
|
||||
:param nrepeats: number of repetitions for each prevalence combination
|
||||
:param n_classes: number of classes
|
||||
:param n_repeats: number of repetitions for each prevalence combination
|
||||
:param combinations_budget: maximum number of combinatios allowed
|
||||
:return: the largest number of prevalence points that generate less than combinations_budget valid prevalences
|
||||
"""
|
||||
assert nclasses>0 and nrepeats>0 and combinations_budget>0, 'parameters must be positive integers'
|
||||
nprevpoints = 1
|
||||
assert n_classes > 0 and n_repeats > 0 and combinations_budget > 0, 'parameters must be positive integers'
|
||||
n_prevpoints = 1
|
||||
while True:
|
||||
combinations = num_prevalence_combinations(nclasses, nprevpoints, nrepeats)
|
||||
combinations = num_prevalence_combinations(n_prevpoints, n_classes, n_repeats)
|
||||
if combinations > combinations_budget:
|
||||
return nprevpoints-1
|
||||
return n_prevpoints-1
|
||||
else:
|
||||
nprevpoints+=1
|
||||
n_prevpoints += 1
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ AGGREGATIVE_METHODS = {
|
|||
agg.AdjustedClassifyAndCount,
|
||||
agg.ProbabilisticClassifyAndCount,
|
||||
agg.ProbabilisticAdjustedClassifyAndCount,
|
||||
agg.ExplicitLossMinimisation,
|
||||
agg.ExplicitLossMinimisationBinary,
|
||||
agg.ExpectationMaximizationQuantifier,
|
||||
agg.HellingerDistanceY
|
||||
}
|
||||
|
|
|
@ -34,6 +34,13 @@ class AggregativeQuantifier(BaseQuantifier):
|
|||
def classify(self, instances):
|
||||
return self.learner.predict(instances)
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
classif_predictions = self.classify(instances)
|
||||
return self.aggregate(classif_predictions, *args)
|
||||
|
||||
@abstractmethod
|
||||
def aggregate(self, classif_predictions:np.ndarray, *args): ...
|
||||
|
||||
def get_params(self, deep=True):
|
||||
return self.learner.get_params()
|
||||
|
||||
|
@ -53,13 +60,17 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
|
|||
"""
|
||||
Abstract class for quantification methods that base their estimations on the aggregation of posterior probabilities
|
||||
as returned by a probabilistic classifier. Aggregative Probabilistic Quantifiers thus extend Aggregative
|
||||
Quantifiersimplement by implementing a _soft_classify_ method returning values in [0,1] -- the posterior
|
||||
Quantifiersimplement by implementing a _posterior_probabilities_ method returning values in [0,1] -- the posterior
|
||||
probabilities.
|
||||
"""
|
||||
|
||||
def soft_classify(self, data):
|
||||
def posterior_probabilities(self, data):
|
||||
return self.learner.predict_proba(data)
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
classif_posteriors = self.posterior_probabilities(instances)
|
||||
return self.aggregate(classif_posteriors, *args)
|
||||
|
||||
def set_params(self, **parameters):
|
||||
if isinstance(self.learner, CalibratedClassifierCV):
|
||||
parameters={'base_estimator__'+k:v for k,v in parameters.items()}
|
||||
|
@ -128,9 +139,8 @@ class ClassifyAndCount(AggregativeQuantifier):
|
|||
self.learner, _ = training_helper(self.learner, data, fit_learner)
|
||||
return self
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
classification = self.classify(instances) # classify
|
||||
return F.prevalence_from_labels(classification, self.n_classes) # & count
|
||||
def aggregate(self, classif_predictions, *args):
|
||||
return F.prevalence_from_labels(classif_predictions, self.n_classes)
|
||||
|
||||
|
||||
class AdjustedClassifyAndCount(AggregativeQuantifier):
|
||||
|
@ -141,17 +151,24 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
|
|||
def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
|
||||
self.learner, validation = training_helper(self.learner, data, fit_learner, train_val_split=train_val_split)
|
||||
self.cc = ClassifyAndCount(self.learner)
|
||||
y_ = self.cc.classify(validation.instances)
|
||||
y_ = self.classify(validation.instances)
|
||||
y = validation.labels
|
||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||
# document that belongs to yj ends up being classified as belonging to yi
|
||||
self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
|
||||
return self
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
prevs_estim = self.cc.quantify(instances)
|
||||
# solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim
|
||||
A = self.Pte_cond_estim_
|
||||
def classify(self, data):
|
||||
return self.cc.classify(data)
|
||||
|
||||
def aggregate(self, classif_predictions, *args):
|
||||
prevs_estim = self.cc.aggregate(classif_predictions)
|
||||
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
|
||||
|
||||
@classmethod
|
||||
def solve_adjustment(cls, PteCondEstim, prevs_estim):
|
||||
# solve for the linear system Ax = B with A=PteCondEstim and B = prevs_estim
|
||||
A = PteCondEstim
|
||||
B = prevs_estim
|
||||
try:
|
||||
adjusted_prevs = np.linalg.solve(A, B)
|
||||
|
@ -161,9 +178,6 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
|
|||
adjusted_prevs = prevs_estim # no way to adjust them!
|
||||
return adjusted_prevs
|
||||
|
||||
def classify(self, data):
|
||||
return self.cc.classify(data)
|
||||
|
||||
|
||||
class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
|
||||
def __init__(self, learner):
|
||||
|
@ -173,13 +187,11 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
|
|||
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
||||
return self
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
posteriors = self.soft_classify(instances) # classify
|
||||
prevalences = F.prevalence_from_probabilities(posteriors, binarize=False) # & count
|
||||
return prevalences
|
||||
def aggregate(self, classif_posteriors, *args):
|
||||
return F.prevalence_from_probabilities(classif_posteriors, binarize=False)
|
||||
|
||||
|
||||
class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
|
||||
class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
|
||||
|
||||
def __init__(self, learner):
|
||||
self.learner = learner
|
||||
|
@ -189,28 +201,23 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
|
|||
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split
|
||||
)
|
||||
self.pcc = ProbabilisticClassifyAndCount(self.learner)
|
||||
y_ = self.pcc.classify(validation.instances)
|
||||
y_ = self.classify(validation.instances)
|
||||
y = validation.labels
|
||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||
# document that belongs to yj ends up being classified as belonging to yi
|
||||
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
|
||||
return self
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
prevs_estim = self.pcc.quantify(instances)
|
||||
A = self.Pte_cond_estim_
|
||||
B = prevs_estim
|
||||
try:
|
||||
adjusted_prevs = np.linalg.solve(A, B)
|
||||
adjusted_prevs = np.clip(adjusted_prevs, 0, 1)
|
||||
adjusted_prevs /= adjusted_prevs.sum()
|
||||
except np.linalg.LinAlgError:
|
||||
adjusted_prevs = prevs_estim # no way to adjust them!
|
||||
return adjusted_prevs
|
||||
def aggregate(self, classif_posteriors, *args):
|
||||
prevs_estim = self.pcc.aggregate(classif_posteriors)
|
||||
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
|
||||
|
||||
def classify(self, data):
|
||||
return self.pcc.classify(data)
|
||||
|
||||
def soft_classify(self, data):
|
||||
return self.pcc.posterior_probabilities(data)
|
||||
|
||||
|
||||
class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
|
||||
|
||||
|
@ -226,10 +233,8 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
|
|||
self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes)
|
||||
return self
|
||||
|
||||
def quantify(self, X, epsilon=EPSILON):
|
||||
tr_prev=self.train_prevalence
|
||||
posteriors = self.soft_classify(X)
|
||||
return self.EM(tr_prev, posteriors, self.verbose, epsilon)
|
||||
def aggregate(self, classif_posteriors, epsilon=EPSILON):
|
||||
return self.EM(self.train_prevalence, classif_posteriors, self.verbose, epsilon)
|
||||
|
||||
@classmethod
|
||||
def EM(cls, tr_prev, posterior_probabilities, verbose=False, epsilon=EPSILON):
|
||||
|
@ -277,17 +282,17 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
|
|||
f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
|
||||
self.learner, validation = training_helper(
|
||||
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
|
||||
Px = self.soft_classify(validation.instances)
|
||||
Px = self.posterior_probabilities(validation.instances)
|
||||
self.Pxy1 = Px[validation.labels == 1]
|
||||
self.Pxy0 = Px[validation.labels == 0]
|
||||
return self
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
def aggregate(self, classif_posteriors, *args):
|
||||
# "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
|
||||
# and the final estimated a priori probability was taken as the median of these 11 estimates."
|
||||
# (González-Castro, et al., 2013).
|
||||
|
||||
Px = self.soft_classify(instances)
|
||||
Px = classif_posteriors
|
||||
|
||||
prev_estimations = []
|
||||
for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
|
||||
|
@ -318,71 +323,87 @@ class OneVsAll(AggregativeQuantifier):
|
|||
quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1.
|
||||
"""
|
||||
|
||||
def __init__(self, binary_method, n_jobs=-1):
|
||||
self.binary_method = binary_method
|
||||
def __init__(self, binary_quantifier, n_jobs=-1):
|
||||
self.binary_quantifier = binary_quantifier
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, data: LabelledCollection, **kwargs):
|
||||
assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
|
||||
assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
|
||||
self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
|
||||
Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||
delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
|
||||
)
|
||||
assert not data.binary, \
|
||||
f'{self.__class__.__name__} expect non-binary data'
|
||||
assert isinstance(self.binary_quantifier, BaseQuantifier), \
|
||||
f'{self.binary_quantifier} does not seem to be a Quantifier'
|
||||
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
|
||||
self.__parallel(self._delayed_binary_fit, data, **kwargs)
|
||||
return self
|
||||
|
||||
def classify(self, instances):
|
||||
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
|
||||
return classif_predictions_bin.T
|
||||
|
||||
def aggregate(self, classif_predictions_bin, *args):
|
||||
assert set(np.unique(classif_predictions_bin)) == {0,1}, \
|
||||
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
|
||||
'predictions for each document (row) and class (columns)'
|
||||
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
|
||||
return F.normalize_prevalence(prevalences)
|
||||
|
||||
def quantify(self, X, *args):
|
||||
prevalences = np.asarray(
|
||||
prevalences = self.__parallel(self._delayed_binary_quantify, X)
|
||||
return F.normalize_prevalence(prevalences)
|
||||
|
||||
def __parallel(self, func, *args, **kwargs):
|
||||
return np.asarray(
|
||||
Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||
delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
|
||||
delayed(func)(c, *args, **kwargs) for c in self.classes
|
||||
)
|
||||
)
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
print('one vs all: ', prevalences)
|
||||
>>>>>>> 2361186a01c53e744f4291e2e2299700216ff139
|
||||
return F.normalize_prevalence(prevalences)
|
||||
|
||||
@property
|
||||
def classes(self):
|
||||
return sorted(self.class_method.keys())
|
||||
return sorted(self.dict_binary_quantifiers.keys())
|
||||
|
||||
def set_params(self, **parameters):
|
||||
self.binary_method.set_params(**parameters)
|
||||
self.binary_quantifier.set_params(**parameters)
|
||||
|
||||
def get_params(self, deep=True):
|
||||
return self.binary_method.get_params()
|
||||
return self.binary_quantifier.get_params()
|
||||
|
||||
def _delayed_binary_predict(self, c, learners, X):
|
||||
return learners[c].classify(X).mean() # the mean is the estimation for the positive class prevalence
|
||||
def _delayed_binary_classification(self, c, X):
|
||||
return self.dict_binary_quantifiers[c].classify(X)
|
||||
|
||||
def _delayed_binary_fit(self, c, learners, data, **kwargs):
|
||||
def _delayed_binary_quantify(self, c, X):
|
||||
return self.dict_binary_quantifiers[c].quantify(X)[1] # the estimation for the positive class prevalence
|
||||
|
||||
def _delayed_binary_aggregate(self, c, classif_predictions):
|
||||
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1] # the estimation for the positive class prevalence
|
||||
|
||||
def _delayed_binary_fit(self, c, data, **kwargs):
|
||||
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
|
||||
learners[c].fit(bindata, **kwargs)
|
||||
self.dict_binary_quantifiers[c].fit(bindata, **kwargs)
|
||||
|
||||
|
||||
class ExplicitLossMinimisation(AggregativeQuantifier):
|
||||
"""
|
||||
A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
|
||||
quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
|
||||
This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
Social Network Analysis and Mining6(19), 1–22 (2016)
|
||||
"""
|
||||
|
||||
def __init__(self, svmperf_base, loss, **kwargs):
|
||||
self.svmperf_base = svmperf_base
|
||||
self.loss = loss
|
||||
self.kwargs = kwargs
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, *args):
|
||||
assert fit_learner, 'the method requires that fit_learner=True'
|
||||
self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
|
||||
if not data.binary:
|
||||
self.learner = OneVsAll(self.learner, n_jobs=-1)
|
||||
return self.learner.fit(data, *args)
|
||||
|
||||
def quantify(self, instances, *args):
|
||||
return self.learner.quantify(instances, *args)
|
||||
# class ExplicitLossMinimisation(AggregativeQuantifier):
|
||||
# """
|
||||
# A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
|
||||
# quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
|
||||
# This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
# Social Network Analysis and Mining6(19), 1–22 (2016)
|
||||
# """
|
||||
#
|
||||
# def __init__(self, svmperf_base, loss, **kwargs):
|
||||
# self.svmperf_base = svmperf_base
|
||||
# self.loss = loss
|
||||
# self.kwargs = kwargs
|
||||
#
|
||||
# def fit(self, data: LabelledCollection, fit_learner=True, *args):
|
||||
# assert fit_learner, 'the method requires that fit_learner=True'
|
||||
# self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
|
||||
# if not data.binary:
|
||||
# self.learner = OneVsAll(self.learner, n_jobs=-1)
|
||||
# return self.learner.fit(data, *args)
|
||||
#
|
||||
# def aggregate(self, instances, *args):
|
||||
# return self.learner.aggregate(instances, *args)
|
||||
|
||||
|
||||
class ExplicitLossMinimisationBinary(AggregativeQuantifier):
|
||||
|
@ -398,38 +419,35 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier):
|
|||
self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
|
||||
return self
|
||||
|
||||
def quantify(self, X, y=None):
|
||||
predictions = self.learner.predict(X)
|
||||
prev = F.prevalence_from_labels(predictions, self.learner.n_classes_)
|
||||
print('binary: ', prev)
|
||||
return prev
|
||||
def aggregate(self, classif_predictions:np.ndarray, *args):
|
||||
return F.prevalence_from_labels(classif_predictions, self.learner.n_classes_)
|
||||
|
||||
def classify(self, X, y=None):
|
||||
return self.learner.predict(X)
|
||||
|
||||
|
||||
|
||||
class SVMQ(ExplicitLossMinimisation):
|
||||
class SVMQ(ExplicitLossMinimisationBinary):
|
||||
def __init__(self, svmperf_base, **kwargs):
|
||||
super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
|
||||
|
||||
|
||||
class SVMKLD(ExplicitLossMinimisation):
|
||||
class SVMKLD(ExplicitLossMinimisationBinary):
|
||||
def __init__(self, svmperf_base, **kwargs):
|
||||
super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)
|
||||
|
||||
|
||||
class SVMNKLD(ExplicitLossMinimisation):
|
||||
class SVMNKLD(ExplicitLossMinimisationBinary):
|
||||
def __init__(self, svmperf_base, **kwargs):
|
||||
super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)
|
||||
|
||||
|
||||
class SVMAE(ExplicitLossMinimisation):
|
||||
class SVMAE(ExplicitLossMinimisationBinary):
|
||||
def __init__(self, svmperf_base, **kwargs):
|
||||
super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs)
|
||||
|
||||
|
||||
class SVMRAE(ExplicitLossMinimisation):
|
||||
class SVMRAE(ExplicitLossMinimisationBinary):
|
||||
def __init__(self, svmperf_base, **kwargs):
|
||||
super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
|
||||
|
||||
|
@ -438,7 +456,7 @@ CC = ClassifyAndCount
|
|||
ACC = AdjustedClassifyAndCount
|
||||
PCC = ProbabilisticClassifyAndCount
|
||||
PACC = ProbabilisticAdjustedClassifyAndCount
|
||||
ELM = ExplicitLossMinimisation
|
||||
ELM = ExplicitLossMinimisationBinary
|
||||
EMQ = ExpectationMaximizationQuantifier
|
||||
HDy = HellingerDistanceY
|
||||
|
||||
|
|
|
@ -18,3 +18,48 @@ class BaseQuantifier(metaclass=ABCMeta):
|
|||
def get_params(self, deep=True): ...
|
||||
|
||||
|
||||
# class OneVsAll(AggregativeQuantifier):
|
||||
# """
|
||||
# Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
|
||||
# quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1.
|
||||
# """
|
||||
#
|
||||
# def __init__(self, binary_method, n_jobs=-1):
|
||||
# self.binary_method = binary_method
|
||||
# self.n_jobs = n_jobs
|
||||
#
|
||||
# def fit(self, data: LabelledCollection, **kwargs):
|
||||
# assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
|
||||
# assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
|
||||
# self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
|
||||
# Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||
# delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
|
||||
# )
|
||||
# return self
|
||||
#
|
||||
# def quantify(self, X, *args):
|
||||
# prevalences = np.asarray(
|
||||
# Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||
# delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
|
||||
# )
|
||||
# )
|
||||
# return F.normalize_prevalence(prevalences)
|
||||
#
|
||||
# @property
|
||||
# def classes(self):
|
||||
# return sorted(self.class_method.keys())
|
||||
#
|
||||
# def set_params(self, **parameters):
|
||||
# self.binary_method.set_params(**parameters)
|
||||
#
|
||||
# def get_params(self, deep=True):
|
||||
# return self.binary_method.get_params()
|
||||
#
|
||||
# def _delayed_binary_predict(self, c, learners, X):
|
||||
# return learners[c].quantify(X)[:,1] # the mean is the estimation for the positive class prevalence
|
||||
#
|
||||
# def _delayed_binary_fit(self, c, learners, data, **kwargs):
|
||||
# bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
|
||||
# learners[c].fit(bindata, **kwargs)
|
||||
|
||||
|
||||
|
|
|
@ -3,6 +3,10 @@ import multiprocessing
|
|||
from joblib import Parallel, delayed
|
||||
import contextlib
|
||||
import numpy as np
|
||||
import urllib
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -33,3 +37,27 @@ def temp_seed(seed):
|
|||
finally:
|
||||
np.random.set_state(state)
|
||||
|
||||
|
||||
def download_file(url, archive_filename):
    """Download ``url`` into the local file ``archive_filename``.

    A single-line progress indicator is printed to stdout while the
    transfer is running.

    :param url: address of the resource to fetch
    :param archive_filename: local path the downloaded content is written to
    """
    # A plain ``import urllib`` does not load the ``request`` submodule, so
    # it is imported explicitly here to avoid an AttributeError.
    import urllib.request

    def progress(blocknum, bs, size):
        downloaded = blocknum * bs
        if size > 0:
            # The last block is usually partial: never report more than the total.
            downloaded = min(downloaded, size)
            total_sz_mb = '%.2f MB' % (size / 1e6)
        else:
            # urlretrieve passes -1 when the server does not announce the size.
            total_sz_mb = 'unknown size'
        current_sz_mb = '%.2f MB' % (downloaded / 1e6)
        print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')

    print("Downloading %s" % url)
    urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
    print("")
|
||||
|
||||
|
||||
def download_file_if_not_exists(url, archive_path):
    """Download ``url`` to ``archive_path`` unless that path already exists.

    The parent directory of ``archive_path`` is created first when needed.

    :param url: address of the resource to fetch
    :param archive_path: local destination path of the download
    """
    if os.path.exists(archive_path):
        return
    parent_dir = os.path.dirname(archive_path)
    if parent_dir:
        # A bare filename has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError; in that case there is nothing to create.
        create_if_not_exist(parent_dir)
    download_file(url, archive_path)
|
||||
|
||||
|
||||
def create_if_not_exist(path):
    """Make sure the directory ``path`` exists.

    Missing intermediate directories are created as well; nothing happens
    when the directory is already there.

    :param path: directory to create
    """
    os.makedirs(name=path, exist_ok=True)
|
||||
|
||||
|
||||
def get_quapy_home():
    """Return the path of the ``quapy_data`` folder inside the user's home directory.

    The path is only computed; this function does not create the folder.
    """
    home_dir = str(Path.home())
    return os.path.join(home_dir, 'quapy_data')
|
44
test.py
44
test.py
|
@ -2,37 +2,45 @@ from sklearn.linear_model import LogisticRegression
|
|||
from sklearn.svm import LinearSVC
|
||||
import quapy as qp
|
||||
import quapy.functional as F
|
||||
import sys
|
||||
|
||||
#qp.datasets.fetch_reviews('hp')
|
||||
#qp.datasets.fetch_twitter('sst')
|
||||
|
||||
#sys.exit()
|
||||
|
||||
SAMPLE_SIZE=500
|
||||
binary = False
|
||||
svmperf_home = './svm_perf_quantification'
|
||||
|
||||
if binary:
|
||||
# load a textual binary dataset and create a tfidf bag of words
|
||||
train_path = './datasets/reviews/kindle/train.txt'
|
||||
test_path = './datasets/reviews/kindle/test.txt'
|
||||
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
|
||||
qp.preprocessing.text2tfidf(dataset, inplace=True)
|
||||
qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
|
||||
dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
|
||||
|
||||
else:
|
||||
# load a sparse matrix ternary dataset
|
||||
train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
|
||||
test_path = './datasets/twitter/test/sst.test.feature.txt'
|
||||
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
|
||||
dataset = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10)
|
||||
dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
|
||||
|
||||
print('dataset loaded')
|
||||
|
||||
# training a quantifier
|
||||
learner = LogisticRegression()
|
||||
model = qp.method.aggregative.ClassifyAndCount(learner)
|
||||
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
|
||||
# model = qp.method.aggregative.ClassifyAndCount(learner)
|
||||
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
|
||||
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
|
||||
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
|
||||
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
|
||||
# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
|
||||
model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
|
||||
|
||||
if not binary:
|
||||
model = qp.method.aggregative.OneVsAll(model)
|
||||
|
||||
print('fitting model')
|
||||
model.fit(dataset.training)
|
||||
|
||||
|
||||
# estimating class prevalences
|
||||
print('quantifying')
|
||||
prevalences_estim = model.quantify(dataset.test.instances)
|
||||
prevalences_true = dataset.test.prevalence()
|
||||
|
||||
|
@ -46,9 +54,17 @@ print(f'true prevalence {F.strprev(prevalences_true)}')
|
|||
print(f'estim prevalence {F.strprev(prevalences_estim)}')
|
||||
print(f'mae={error:.3f}')
|
||||
|
||||
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE)
|
||||
|
||||
qp.error.SAMPLE_SIZE=SAMPLE_SIZE
|
||||
max_evaluations = 5000
|
||||
n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
|
||||
n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
|
||||
print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that\n'
|
||||
f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n'
|
||||
f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')
|
||||
|
||||
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
|
||||
|
||||
qp.error.SAMPLE_SIZE = SAMPLE_SIZE
|
||||
print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
|
||||
for error in qp.error.QUANTIFICATION_ERROR:
|
||||
score = error(true_prev, estim_prev)
|
||||
|
|
Loading…
Reference in New Issue