forked from moreo/QuaPy
evaluation by artificial prevalence sampling added. New methods added. New util functions added to quapy.functional and quapy.utils
This commit is contained in:
parent
a882424eeb
commit
9bc3a9f28a
|
@ -1,3 +1,3 @@
|
||||||
# QuaPy
|
# QuaPy
|
||||||
|
|
||||||
A Python framework for Quantification
|
A Quantification framework written in Python.
|
9
TODO.txt
9
TODO.txt
|
@ -1,3 +1,8 @@
|
||||||
Documentation with sphinx
|
Documentation with sphinx
|
||||||
The parallel training in svmperf seems not to work
|
Add evaluation - artificial sampling
|
||||||
Add "prepare svmperf for quantification" script
|
Add quantification_report (akin to classification_report from sklearn)
|
||||||
|
Add optimization - artificial sampling
|
||||||
|
Add prediction - artificial sampling
|
||||||
|
Add readers for typical datasets used in Quantification
|
||||||
|
Add NAE, NRAE
|
||||||
|
Add "measures for evaluating ordinal"?
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from .dataset import *
|
from .data import *
|
||||||
from . import functional
|
from . import functional
|
||||||
from . import method
|
from . import method
|
||||||
from . import error
|
from . import error
|
||||||
|
from . import evaluation
|
||||||
|
|
||||||
|
|
|
@ -20,12 +20,9 @@ class SVMperf(BaseEstimator, ClassifierMixin):
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
self.loss = loss
|
self.loss = loss
|
||||||
|
|
||||||
def set_c(self, C):
|
|
||||||
self.param_C = '-c ' + str(C)
|
|
||||||
|
|
||||||
def set_params(self, **parameters):
|
def set_params(self, **parameters):
|
||||||
assert list(parameters.keys()) == ['C'], 'currently, only the C parameter is supported'
|
assert list(parameters.keys()) == ['C'], 'currently, only the C parameter is supported'
|
||||||
self.set_c(parameters['C'])
|
self.C = parameters['C']
|
||||||
|
|
||||||
def fit(self, X, y):
|
def fit(self, X, y):
|
||||||
assert self.loss in SVMperf.valid_losses, \
|
assert self.loss in SVMperf.valid_losses, \
|
||||||
|
@ -33,8 +30,8 @@ class SVMperf(BaseEstimator, ClassifierMixin):
|
||||||
|
|
||||||
self.svmperf_learn = join(self.svmperf_base, 'svm_perf_learn')
|
self.svmperf_learn = join(self.svmperf_base, 'svm_perf_learn')
|
||||||
self.svmperf_classify = join(self.svmperf_base, 'svm_perf_classify')
|
self.svmperf_classify = join(self.svmperf_base, 'svm_perf_classify')
|
||||||
self.loss_cmd = '-l ' + str(self.valid_losses[self.loss])
|
self.loss_cmd = '-w 3 -l ' + str(self.valid_losses[self.loss])
|
||||||
self.set_c(self.C)
|
self.c_cmd = '-c ' + str(self.C)
|
||||||
|
|
||||||
self.classes_ = sorted(np.unique(y))
|
self.classes_ = sorted(np.unique(y))
|
||||||
self.n_classes_ = len(self.classes_)
|
self.n_classes_ = len(self.classes_)
|
||||||
|
@ -49,7 +46,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):
|
||||||
|
|
||||||
dump_svmlight_file(X, y, traindat, zero_based=False)
|
dump_svmlight_file(X, y, traindat, zero_based=False)
|
||||||
|
|
||||||
cmd = ' '.join([self.svmperf_learn, self.param_C, self.loss_cmd, traindat, self.model])
|
cmd = ' '.join([self.svmperf_learn, self.c_cmd, self.loss_cmd, traindat, self.model])
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print('[Running]', cmd)
|
print('[Running]', cmd)
|
||||||
p = subprocess.run(cmd.split(), stdout=PIPE, stderr=STDOUT)
|
p = subprocess.run(cmd.split(), stdout=PIPE, stderr=STDOUT)
|
||||||
|
@ -60,7 +57,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def predict(self, X, y=None):
|
def predict(self, X):
|
||||||
confidence_scores = self.decision_function(X)
|
confidence_scores = self.decision_function(X)
|
||||||
predictions = (confidence_scores > 0) * 1
|
predictions = (confidence_scores > 0) * 1
|
||||||
return predictions
|
return predictions
|
||||||
|
|
|
@ -22,12 +22,6 @@ class LabelledCollection:
|
||||||
def load(cls, path:str, loader_func:callable):
|
def load(cls, path:str, loader_func:callable):
|
||||||
return LabelledCollection(*loader_func(path))
|
return LabelledCollection(*loader_func(path))
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def load_dataset(cls, train_path, test_path):
|
|
||||||
training = cls.load(train_path)
|
|
||||||
test = cls.load(test_path)
|
|
||||||
return Dataset(training, test)
|
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self.instances.shape[0]
|
return self.instances.shape[0]
|
||||||
|
|
||||||
|
@ -49,7 +43,7 @@ class LabelledCollection:
|
||||||
if len(prevs) == self.n_classes-1:
|
if len(prevs) == self.n_classes-1:
|
||||||
prevs = prevs + (1-sum(prevs),)
|
prevs = prevs + (1-sum(prevs),)
|
||||||
assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
|
assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
|
||||||
assert sum(prevs) == 1, f'prevalences ({prevs}) out of range (sum={sum(prevs)})'
|
assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
|
||||||
|
|
||||||
taken = 0
|
taken = 0
|
||||||
indexes_sample = []
|
indexes_sample = []
|
||||||
|
@ -93,6 +87,11 @@ class LabelledCollection:
|
||||||
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
|
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
|
||||||
yield self.sampling(sample_size, *prevs)
|
yield self.sampling(sample_size, *prevs)
|
||||||
|
|
||||||
|
def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
    """
    Lazily generates one sampling index per prevalence vector of the artificial sampling protocol.
    :param sample_size: forwarded as first argument to self.sampling_index for every prevalence vector
    :param n_prevalences: forwarded to artificial_prevalence_sampling (default 101)
    :param repeats: forwarded to artificial_prevalence_sampling (default 1)
    :return: a generator yielding the result of self.sampling_index(sample_size, *prevs) for each prevalence
    vector prevs produced by artificial_prevalence_sampling(self.n_classes, n_prevalences, repeats)
    """
    prevalence_grid = artificial_prevalence_sampling(self.n_classes, n_prevalences, repeats)
    for prevalence_vector in prevalence_grid:
        yield self.sampling_index(sample_size, *prevalence_vector)
|
||||||
|
|
||||||
def __add__(self, other):
|
def __add__(self, other):
|
||||||
if issparse(self.instances) and issparse(other.documents):
|
if issparse(self.instances) and issparse(other.documents):
|
||||||
docs = vstack([self.instances, other.documents])
|
docs = vstack([self.instances, other.documents])
|
|
@ -1,9 +1,10 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
||||||
from dataset.base import Dataset
|
from data.base import Dataset
|
||||||
from scipy.sparse import spmatrix
|
from scipy.sparse import spmatrix
|
||||||
from utils.util import parallelize
|
from utils.util import parallelize
|
||||||
from .base import LabelledCollection
|
from .base import LabelledCollection
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
|
def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
|
||||||
|
@ -78,8 +79,8 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
|
||||||
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
|
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
|
||||||
consisting of lists of integer values representing indices.
|
consisting of lists of integer values representing indices.
|
||||||
"""
|
"""
|
||||||
__check_type(dataset.training.instances, list, str)
|
__check_type(dataset.training.instances, np.ndarray, str)
|
||||||
__check_type(dataset.test.instances, list, str)
|
__check_type(dataset.test.instances, np.ndarray, str)
|
||||||
|
|
||||||
indexer = IndexTransformer(min_df=min_df, **kwargs)
|
indexer = IndexTransformer(min_df=min_df, **kwargs)
|
||||||
training_index = indexer.fit_transform(dataset.training.instances)
|
training_index = indexer.fit_transform(dataset.training.instances)
|
||||||
|
@ -105,7 +106,6 @@ def __check_type(container, container_type=None, element_type=None):
|
||||||
f'unexpected type of element (expected {container_type}, found {type(container)})'
|
f'unexpected type of element (expected {container_type}, found {type(container)})'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class IndexTransformer:
|
class IndexTransformer:
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
|
@ -140,7 +140,7 @@ class IndexTransformer:
|
||||||
return self.fit(X).transform(X, n_jobs=n_jobs)
|
return self.fit(X).transform(X, n_jobs=n_jobs)
|
||||||
|
|
||||||
def vocabulary_size(self):
|
def vocabulary_size(self):
|
||||||
return len(self.vocabulary_) + 1 # the reserved unk token
|
return len(self.vocabulary_)
|
||||||
|
|
||||||
def add_word(self, word):
|
def add_word(self, word):
|
||||||
if word in self.vocabulary_:
|
if word in self.vocabulary_:
|
|
@ -1,5 +1,8 @@
|
||||||
from sklearn.metrics import f1_score
|
from sklearn.metrics import f1_score
|
||||||
from settings import SAMPLE_SIZE
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_SIZE = None
|
||||||
|
|
||||||
|
|
||||||
def f1e(y_true, y_pred):
|
def f1e(y_true, y_pred):
|
||||||
|
@ -7,8 +10,7 @@ def f1e(y_true, y_pred):
|
||||||
|
|
||||||
|
|
||||||
def acce(y_true, y_pred):
|
def acce(y_true, y_pred):
|
||||||
acc = (y_true == y_pred).mean()
|
return 1. - (y_true == y_pred).mean()
|
||||||
return 1. - acc
|
|
||||||
|
|
||||||
|
|
||||||
def mae(prevs, prevs_hat):
|
def mae(prevs, prevs_hat):
|
||||||
|
@ -20,11 +22,40 @@ def ae(p, p_hat):
|
||||||
return abs(p_hat-p).mean(axis=-1)
|
return abs(p_hat-p).mean(axis=-1)
|
||||||
|
|
||||||
|
|
||||||
def mrae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)):
|
def mse(prevs, prevs_hat):
    """
    Mean squared error: the mean of the values returned by se(prevs, prevs_hat).
    :param prevs: true prevalences
    :param prevs_hat: estimated prevalences
    :return: the mean of the squared errors
    """
    squared_errors = se(prevs, prevs_hat)
    return squared_errors.mean()
|
||||||
|
|
||||||
|
|
||||||
|
def se(p, p_hat):
    """
    Squared error between true and estimated prevalences, averaged along the last axis.
    :param p: true prevalences
    :param p_hat: estimated prevalences
    :return: the mean (over the last axis) of the squared differences between p_hat and p
    """
    difference = p_hat - p
    return (difference ** 2).mean(axis=-1)
|
||||||
|
|
||||||
|
|
||||||
|
def mkld(prevs, prevs_hat):
    """
    Mean Kullback-Leibler divergence: the mean of the values returned by kld(prevs, prevs_hat).
    :param prevs: true prevalences
    :param prevs_hat: estimated prevalences
    :return: the mean of the divergences
    """
    divergences = kld(prevs, prevs_hat)
    return divergences.mean()
|
||||||
|
|
||||||
|
|
||||||
|
def kld(p, p_hat, eps=None):
    """
    Kullback-Leibler divergence between p and p_hat, computed after adding eps to every component of both
    (the shifted vectors are used as they are, i.e., they are not re-normalized).
    :param p: true prevalences
    :param p_hat: estimated prevalences
    :param eps: additive smoothing term; if None, it is resolved by __check_eps
    :return: the sum (over the last axis) of sp*log(sp/sp_hat), with sp=p+eps and sp_hat=p_hat+eps
    """
    eps = __check_eps(eps)
    smoothed_true = p + eps
    smoothed_estim = p_hat + eps
    log_ratio = np.log(smoothed_true / smoothed_estim)
    return (smoothed_true * log_ratio).sum(axis=-1)
|
||||||
|
|
||||||
|
|
||||||
|
def mnkld(prevs, prevs_hat):
    """
    Mean normalized Kullback-Leibler divergence: the mean of the values returned by nkld(prevs, prevs_hat).
    :param prevs: true prevalences
    :param prevs_hat: estimated prevalences
    :return: the mean of the normalized divergences
    """
    normalized_divergences = nkld(prevs, prevs_hat)
    return normalized_divergences.mean()
|
||||||
|
|
||||||
|
|
||||||
|
def nkld(p, p_hat, eps=None):
    """
    Normalized Kullback-Leibler divergence, defined as 2*e^kld/(1+e^kld) - 1, where kld is the value returned by
    kld(p, p_hat, eps).
    :param p: true prevalences
    :param p_hat: estimated prevalences
    :param eps: smoothing term, forwarded to kld
    :return: the normalized divergence
    """
    divergence = kld(p, p_hat, eps)
    exp_divergence = np.exp(divergence)
    return 2. * exp_divergence / (1 + exp_divergence) - 1.
|
||||||
|
|
||||||
|
|
||||||
|
def mrae(p, p_hat, eps=None):
|
||||||
return rae(p, p_hat, eps).mean()
|
return rae(p, p_hat, eps).mean()
|
||||||
|
|
||||||
|
|
||||||
def rae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)):
|
def rae(p, p_hat, eps=None):
|
||||||
|
eps = __check_eps(eps)
|
||||||
p = smooth(p, eps)
|
p = smooth(p, eps)
|
||||||
p_hat = smooth(p_hat, eps)
|
p_hat = smooth(p_hat, eps)
|
||||||
return (abs(p-p_hat)/p).mean(axis=-1)
|
return (abs(p-p_hat)/p).mean(axis=-1)
|
||||||
|
@ -35,8 +66,17 @@ def smooth(p, eps):
|
||||||
return (p+eps)/(eps*n_classes + 1)
|
return (p+eps)/(eps*n_classes + 1)
|
||||||
|
|
||||||
|
|
||||||
|
def __check_eps(eps):
    """
    Resolves the smoothing term to be used by the error functions.
    :param eps: the smoothing term requested by the caller, or None
    :return: eps itself if it is not None; otherwise 1/(2*SAMPLE_SIZE), with SAMPLE_SIZE the module-level variable
    :raises ValueError: if both eps and SAMPLE_SIZE are None
    """
    # an explicit value always takes precedence over the module-level default
    if eps is not None:
        return eps
    if SAMPLE_SIZE is None:
        raise ValueError('eps was not defined, and qp.error.SAMPLE_SIZE was not set')
    return 1. / (2. * SAMPLE_SIZE)
|
||||||
|
|
||||||
|
|
||||||
CLASSIFICATION_ERROR = {f1e, acce}
|
CLASSIFICATION_ERROR = {f1e, acce}
|
||||||
QUANTIFICATION_ERROR = {mae, mrae}
|
QUANTIFICATION_ERROR = {mae, mrae, mse, mkld, mnkld}
|
||||||
|
|
||||||
f1_error = f1e
|
f1_error = f1e
|
||||||
acc_error = acce
|
acc_error = acce
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
from data import LabelledCollection
|
||||||
|
from method.base import BaseQuantifier
|
||||||
|
from utils.util import temp_seed
|
||||||
|
import numpy as np
|
||||||
|
from joblib import Parallel, delayed
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
def artificial_sampling_prediction(
        model: BaseQuantifier,
        test: LabelledCollection,
        sample_size,
        prevalence_points=21,
        point_repetitions=1,
        n_jobs=-1,
        random_seed=42):
    """
    Performs the predictions for all samples generated according to the artificial sampling protocol.
    :param model: the model in charge of generating the class prevalence estimations
    :param test: the test set on which to perform artificial sampling
    :param sample_size: the size of the samples
    :param prevalence_points: the number of different prevalences to sample
    :param point_repetitions: the number of repetitions for each prevalence
    :param n_jobs: number of jobs to be run in parallel
    :param random_seed: allows to replicate the samplings; the sampling indexes are drawn inside a temp_seed
    context (presumably restoring the previous random state on exit -- confirm in utils.util)
    :return: two ndarrays with one row per generated sample: the first one contains the true prevalences of the
    samples, while the second one contains the prevalence estimations issued by the model
    """
    # all sampling indexes are materialized up-front, inside the seeded context; the predictions run afterwards
    with temp_seed(random_seed):
        sample_indexes = list(
            test.artificial_sampling_index_generator(sample_size, prevalence_points, point_repetitions)
        )

    def _evaluate_sample(sample_index):
        # rebuilds the sample from its index and pairs its true prevalence with the model's estimation
        sample = test.sampling_from_index(sample_index)
        return sample.prevalence(), model.quantify(sample.instances)

    jobs = (delayed(_evaluate_sample)(sample_index) for sample_index in tqdm(sample_indexes))
    outcomes = Parallel(n_jobs=n_jobs)(jobs)

    # from a list of (true, estimated) pairs to a pair of arrays
    true_prevalences, estim_prevalences = zip(*outcomes)
    return np.asarray(true_prevalences), np.asarray(estim_prevalences)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -15,6 +15,26 @@ def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, retur
|
||||||
return prevs
|
return prevs
|
||||||
|
|
||||||
|
|
||||||
|
def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
    """
    Produces uniformly separated values of prevalence. By default, produces an array of 21 prevalences, with step
    0.05 and with the limits smoothed, i.e.:
    [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99]
    :param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
    :param repeat: number of times each prevalence is to be repeated (defaults to 1)
    :param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
    :return: an array of uniformly separated prevalence values
    :raises ValueError: if n_prevalences is smaller than 1, or if the smoothing moves the first value beyond the
    second one
    """
    if n_prevalences < 1:
        raise ValueError('n_prevalences must be a positive integer')
    p = np.linspace(0., 1., num=n_prevalences, endpoint=True)
    p[0] += smooth_limits_epsilon
    p[-1] -= smooth_limits_epsilon
    # with a single point there is no prevalence step to compare the smoothing against
    if p.size > 1 and p[0] > p[1]:
        raise ValueError('the smoothing in the limits is greater than the prevalence step')
    if repeat > 1:
        p = np.repeat(p, repeat)
    return p
|
||||||
|
|
||||||
|
|
||||||
def prevalence_from_labels(labels, n_classes):
|
def prevalence_from_labels(labels, n_classes):
|
||||||
unique, counts = np.unique(labels, return_counts=True)
|
unique, counts = np.unique(labels, return_counts=True)
|
||||||
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
|
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
|
||||||
|
@ -47,3 +67,54 @@ def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True):
|
||||||
return adjusted
|
return adjusted
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_prevalence(prevalences):
    """
    L1-normalizes a vector of prevalences so that its components sum up to 1.
    :param prevalences: a one-dimensional array of prevalence values
    :return: prevalences divided by their sum if the sum is positive; the uniform vector (1/size in every
    component) otherwise
    """
    assert prevalences.ndim==1, 'unexpected shape'
    total = prevalences.sum()
    if not total > 0:
        # nothing to rescale (e.g., if all classifiers are trivial rejectors): fall back to the uniform prevalence
        return np.ones_like(prevalences) / prevalences.size
    return prevalences / total
|
||||||
|
|
||||||
|
|
||||||
|
def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
    """
    Computes the number of prevalence combinations in the nclasses-dimensional simplex if nprevpoints equally distant
    prevalences are generated and nrepeats repetitions are requested
    :param nclasses: number of classes
    :param nprevpoints: number of prevalence points.
    :param nrepeats: number of repetitions for each prevalence combination
    :return: The number of possible combinations. For example, if nclasses=2, nprevpoints=5, nrepeats=1, then the number
    of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
    :raises ValueError: if nclasses is smaller than 1
    """
    if nclasses < 1:
        raise ValueError('nclasses must be a positive integer')
    if nclasses == 1:
        # a single class admits one only prevalence vector, no matter how many points are requested
        return nrepeats
    if nprevpoints <= 0:
        return 0
    # The recurrence f(1,n)=1, f(c,n)=f(c-1,n)+f(c-1,n-1)+...+f(c-1,1) has the closed form
    # f(c,n) = C(n+c-2, c-1), i.e., the number of ways of distributing the n-1 grid steps among c classes.
    # The binomial coefficient is computed multiplicatively (every partial product is itself a binomial
    # coefficient, hence the integer division is exact) using the smaller of the two symmetric arguments.
    total = nprevpoints + nclasses - 2
    chosen = min(nclasses - 1, nprevpoints - 1)
    combinations = 1
    for i in range(1, chosen + 1):
        combinations = combinations * (total - chosen + i) // i
    return combinations * nrepeats
|
||||||
|
|
||||||
|
|
||||||
|
def get_nprevpoints_approximation(nclasses, nrepeats, combinations_budget):
    """
    Searches for the largest number of (equidistant) prevalence points to define for each of the nclasses classes so
    that the number of valid prevalences generated as combinations of prevalence points (points in a
    nclasses-dimensional simplex) does not exceed combinations_budget.
    :param nclasses: number of classes
    :param nrepeats: number of repetitions for each prevalence combination
    :param combinations_budget: maximum number of combinations allowed
    :return: the largest number of prevalence points that generate no more than combinations_budget valid
    prevalences (0 if even one single prevalence point exceeds the budget)
    :raises ValueError: if nclasses is 1 and nrepeats does not exceed the budget, since in that case the number of
    combinations does not depend on the number of prevalence points and no largest value exists
    """
    assert nclasses>0 and nrepeats>0 and combinations_budget>0, 'parameters must be positive integers'
    if nclasses == 1 and nrepeats <= combinations_budget:
        # num_prevalence_combinations is constant (=nrepeats) for one class: the search below would never stop
        raise ValueError('the number of prevalence points is unbounded for nclasses=1')
    nprevpoints = 1
    # grows the number of points until the budget is exceeded; the last admissible value is the previous one
    while num_prevalence_combinations(nclasses, nprevpoints, nrepeats) <= combinations_budget:
        nprevpoints += 1
    return nprevpoints - 1
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
|
from . import base
|
||||||
from . import aggregative as agg
|
from . import aggregative as agg
|
||||||
from . import non_aggregative as nagg
|
from . import non_aggregative
|
||||||
|
|
||||||
|
|
||||||
AGGREGATIVE_METHODS = {
|
AGGREGATIVE_METHODS = {
|
||||||
|
@ -9,22 +10,14 @@ AGGREGATIVE_METHODS = {
|
||||||
agg.ProbabilisticAdjustedClassifyAndCount,
|
agg.ProbabilisticAdjustedClassifyAndCount,
|
||||||
agg.ExplicitLossMinimisation,
|
agg.ExplicitLossMinimisation,
|
||||||
agg.ExpectationMaximizationQuantifier,
|
agg.ExpectationMaximizationQuantifier,
|
||||||
|
agg.HellingerDistanceY
|
||||||
}
|
}
|
||||||
|
|
||||||
NON_AGGREGATIVE_METHODS = {
|
NON_AGGREGATIVE_METHODS = {
|
||||||
nagg.MaximumLikelihoodPrevalenceEstimation
|
non_aggregative.MaximumLikelihoodPrevalenceEstimation
|
||||||
}
|
}
|
||||||
|
|
||||||
QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS
|
QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS
|
||||||
|
|
||||||
|
|
||||||
# common alisases
|
|
||||||
CC = agg.ClassifyAndCount
|
|
||||||
ACC = agg.AdjustedClassifyAndCount
|
|
||||||
PCC = agg.ProbabilisticClassifyAndCount
|
|
||||||
PACC = agg.ProbabilisticAdjustedClassifyAndCount
|
|
||||||
ELM = agg.ExplicitLossMinimisation
|
|
||||||
EMQ = agg.ExpectationMaximizationQuantifier
|
|
||||||
MLPE = nagg.MaximumLikelihoodPrevalenceEstimation
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,14 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from .base import *
|
from copy import deepcopy
|
||||||
from ..error import mae
|
|
||||||
import functional as F
|
import functional as F
|
||||||
from ..classification.svmperf import SVMperf
|
import error
|
||||||
from ..dataset import LabelledCollection
|
from method.base import BaseQuantifier
|
||||||
|
from quapy.classification.svmperf import SVMperf
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
from sklearn.metrics import confusion_matrix
|
from sklearn.metrics import confusion_matrix
|
||||||
from sklearn.calibration import CalibratedClassifierCV
|
from sklearn.calibration import CalibratedClassifierCV
|
||||||
from joblib import Parallel, delayed
|
from joblib import Parallel, delayed
|
||||||
|
from abc import abstractmethod
|
||||||
|
|
||||||
|
|
||||||
# Abstract classes
|
# Abstract classes
|
||||||
|
@ -21,8 +23,16 @@ class AggregativeQuantifier(BaseQuantifier):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True, *args): ...
|
def fit(self, data: LabelledCollection, fit_learner=True, *args): ...
|
||||||
|
|
||||||
def classify(self, documents):
|
@property
|
||||||
return self.learner.predict(documents)
|
def learner(self):
|
||||||
|
return self.learner_
|
||||||
|
|
||||||
|
@learner.setter
|
||||||
|
def learner(self, value):
|
||||||
|
self.learner_ = value
|
||||||
|
|
||||||
|
def classify(self, instances):
    """
    Issues the label predictions for the given instances by delegating to the predict method of the learner.
    :param instances: the instances to classify
    :return: whatever self.learner.predict returns for the instances
    """
    predicted_labels = self.learner.predict(instances)
    return predicted_labels
|
||||||
|
|
||||||
def get_params(self, deep=True):
|
def get_params(self, deep=True):
|
||||||
return self.learner.get_params()
|
return self.learner.get_params()
|
||||||
|
@ -67,12 +77,12 @@ def training_helper(learner,
|
||||||
Training procedure common to all Aggregative Quantifiers.
|
Training procedure common to all Aggregative Quantifiers.
|
||||||
:param learner: the learner to be fit
|
:param learner: the learner to be fit
|
||||||
:param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
|
:param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
|
||||||
:param fit_learner: whether or not to fit the learner
|
:param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
|
||||||
:param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
|
:param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
|
||||||
learner is not probabilistic, then a CalibratedCV instance of it is trained)
|
learner is not probabilistic, then a CalibratedCV instance of it is trained)
|
||||||
:param train_val_split: if specified, indicates the proportion of training documents on which to fit the learner
|
:param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
|
||||||
:return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
|
:return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
|
||||||
or None otherwise)
|
or None otherwise) to be used as a validation set for any subsequent parameter fitting
|
||||||
"""
|
"""
|
||||||
if fit_learner:
|
if fit_learner:
|
||||||
if ensure_probabilistic:
|
if ensure_probabilistic:
|
||||||
|
@ -118,8 +128,8 @@ class ClassifyAndCount(AggregativeQuantifier):
|
||||||
self.learner, _ = training_helper(self.learner, data, fit_learner)
|
self.learner, _ = training_helper(self.learner, data, fit_learner)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, documents, *args):
|
def quantify(self, instances, *args):
|
||||||
classification = self.classify(documents) # classify
|
classification = self.classify(instances) # classify
|
||||||
return F.prevalence_from_labels(classification, self.n_classes) # & count
|
return F.prevalence_from_labels(classification, self.n_classes) # & count
|
||||||
|
|
||||||
|
|
||||||
|
@ -138,8 +148,8 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
|
||||||
self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
|
self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, documents, *args):
|
def quantify(self, instances, *args):
|
||||||
prevs_estim = self.cc.quantify(documents)
|
prevs_estim = self.cc.quantify(instances)
|
||||||
# solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim
|
# solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim
|
||||||
A = self.Pte_cond_estim_
|
A = self.Pte_cond_estim_
|
||||||
B = prevs_estim
|
B = prevs_estim
|
||||||
|
@ -163,8 +173,8 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
|
||||||
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, documents, *args):
|
def quantify(self, instances, *args):
|
||||||
posteriors = self.soft_classify(documents) # classify
|
posteriors = self.soft_classify(instances) # classify
|
||||||
prevalences = F.prevalence_from_probabilities(posteriors, binarize=False) # & count
|
prevalences = F.prevalence_from_probabilities(posteriors, binarize=False) # & count
|
||||||
return prevalences
|
return prevalences
|
||||||
|
|
||||||
|
@ -186,8 +196,8 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
|
||||||
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
|
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def quantify(self, documents, *args):
|
def quantify(self, instances, *args):
|
||||||
prevs_estim = self.pcc.quantify(documents)
|
prevs_estim = self.pcc.quantify(instances)
|
||||||
A = self.Pte_cond_estim_
|
A = self.Pte_cond_estim_
|
||||||
B = prevs_estim
|
B = prevs_estim
|
||||||
try:
|
try:
|
||||||
|
@ -237,7 +247,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
|
||||||
# M-step: qs_pos is Ps+1(y=+1)
|
# M-step: qs_pos is Ps+1(y=+1)
|
||||||
qs = ps.mean(axis=0)
|
qs = ps.mean(axis=0)
|
||||||
|
|
||||||
if qs_prev_ is not None and mae(qs, qs_prev_) < epsilon and s>10:
|
if qs_prev_ is not None and error.mae(qs, qs_prev_) < epsilon and s>10:
|
||||||
converged = True
|
converged = True
|
||||||
|
|
||||||
qs_prev_ = qs
|
qs_prev_ = qs
|
||||||
|
@ -252,79 +262,149 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
|
||||||
return qs
|
return qs
|
||||||
|
|
||||||
|
|
||||||
# todo: from here
|
class HellingerDistanceY(AggregativeProbabilisticQuantifier):
|
||||||
def train_task(c, learners, data):
|
"""
|
||||||
learners[c].fit(data.documents, data.labels == c)
|
Implementation of the method based on the Hellinger Distance y (HDy) proposed by
|
||||||
|
González-Castro, V., Alaiz-Rodrı́guez, R., and Alegre, E. (2013). Class distribution
|
||||||
|
estimation based on the Hellinger distance. Information Sciences, 218:146–164.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, learner):
|
||||||
|
self.learner = learner
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
|
||||||
|
assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
|
||||||
|
f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
|
||||||
|
self.learner, validation = training_helper(
|
||||||
|
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
|
||||||
|
Px = self.soft_classify(validation.instances)
|
||||||
|
self.Pxy1 = Px[validation.labels == 1]
|
||||||
|
self.Pxy0 = Px[validation.labels == 0]
|
||||||
|
return self
|
||||||
|
|
||||||
|
def quantify(self, instances, *args):
|
||||||
|
# "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
|
||||||
|
# and the final estimated a priori probability was taken as the median of these 11 estimates."
|
||||||
|
# (González-Castro, et al., 2013).
|
||||||
|
|
||||||
|
Px = self.soft_classify(instances)
|
||||||
|
|
||||||
|
prev_estimations = []
|
||||||
|
for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
|
||||||
|
Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
|
||||||
|
Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
|
||||||
|
|
||||||
|
Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True)
|
||||||
|
|
||||||
|
prev_selected, min_dist = None, None
|
||||||
|
for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0):
|
||||||
|
Px_train = prev*Pxy1_density + (1 - prev)*Pxy0_density
|
||||||
|
hdy = HellingerDistanceY.HellingerDistance(Px_train, Px_test)
|
||||||
|
if prev_selected is None or hdy < min_dist:
|
||||||
|
prev_selected, min_dist = prev, hdy
|
||||||
|
prev_estimations.append(prev_selected)
|
||||||
|
|
||||||
|
pos_class_prev = np.median(prev_estimations)
|
||||||
|
return np.asarray([1-pos_class_prev, pos_class_prev])
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def HellingerDistance(cls, P, Q):
|
||||||
|
return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
|
||||||
|
|
||||||
|
|
||||||
def binary_quant_task(c, learners, X):
|
class OneVsAll(AggregativeQuantifier):
|
||||||
predictions_ci = learners[c].predict(X)
|
"""
|
||||||
return predictions_ci.mean() # since the predictions array is binary
|
Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
|
||||||
|
quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, binary_method, n_jobs=-1):
|
||||||
|
self.binary_method = binary_method
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, **kwargs):
|
||||||
|
assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
|
||||||
|
assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
|
||||||
|
self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
|
||||||
|
Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||||
|
delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
|
||||||
|
)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def quantify(self, X, *args):
|
||||||
|
prevalences = np.asarray(
|
||||||
|
Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||||
|
delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return F.normalize_prevalence(prevalences)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def classes(self):
|
||||||
|
return sorted(self.class_method.keys())
|
||||||
|
|
||||||
|
def set_params(self, **parameters):
|
||||||
|
self.binary_method.set_params(**parameters)
|
||||||
|
|
||||||
|
def get_params(self, deep=True):
|
||||||
|
return self.binary_method.get_params()
|
||||||
|
|
||||||
|
def _delayed_binary_predict(self, c, learners, X):
|
||||||
|
return learners[c].classify(X).mean() # the mean is the estimation for the positive class prevalence
|
||||||
|
|
||||||
|
def _delayed_binary_fit(self, c, learners, data, **kwargs):
|
||||||
|
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
|
||||||
|
learners[c].fit(bindata, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class OneVsAllELM(AggregativeQuantifier):
|
class ExplicitLossMinimisation(AggregativeQuantifier):
    """
    A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
    quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
    This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
    Social Network Analysis and Mining 6(19), 1–22 (2016)
    """

    def __init__(self, svmperf_base, loss, **kwargs):
        """
        :param svmperf_base: passed as first argument to the underlying binary quantifier
        :param loss: the loss to optimise, passed to the underlying binary quantifier
        :param kwargs: additional keyword arguments passed to the underlying binary quantifier
        """
        self.svmperf_base = svmperf_base
        self.loss = loss
        self.kwargs = kwargs

    def fit(self, data: LabelledCollection, fit_learner=True, *args):
        """Fit a binary ELM quantifier, wrapped in `OneVsAll` when `data` is not binary.

        :param data: the training collection
        :param fit_learner: must be True; the learner is always trained here
        :param args: extra positional arguments forwarded to the inner `fit`
        :return: self
        """
        assert fit_learner, 'the method requires that fit_learner=True'
        self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
        if not data.binary:
            self.learner = OneVsAll(self.learner, n_jobs=-1)
        self.learner.fit(data, *args)
        # return this object (not the inner learner), as the other `fit` methods do
        return self

    def quantify(self, instances, *args):
        """Delegate the prevalence estimation to the fitted inner quantifier."""
        return self.learner.quantify(instances, *args)
|
||||||
delayed(train_task)(c, self.learners, data) for c in self.learners.keys()
|
|
||||||
)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def quantify(self, X, y=None):
|
|
||||||
prevalences = np.asarray(
|
|
||||||
Parallel(n_jobs=self.n_jobs, backend='threading')(
|
|
||||||
delayed(binary_quant_task)(c, self.learners, X) for c in self.learners.keys()
|
|
||||||
)
|
|
||||||
)
|
|
||||||
prevalences /= prevalences.sum()
|
|
||||||
return prevalences
|
|
||||||
|
|
||||||
@property
|
|
||||||
def classes(self):
|
|
||||||
return sorted(self.learners.keys())
|
|
||||||
|
|
||||||
def preclassify_collection(self, data: LabelledCollection):
|
|
||||||
classifications = []
|
|
||||||
for class_ in data.classes_:
|
|
||||||
classifications.append(self.learners[class_].predict(data.instances))
|
|
||||||
classifications = np.vstack(classifications).T
|
|
||||||
precomputed = LabelledCollection(classifications, data.labels)
|
|
||||||
return precomputed
|
|
||||||
|
|
||||||
def set_params(self, **parameters):
|
|
||||||
self.kwargs=parameters
|
|
||||||
|
|
||||||
def get_params(self, deep=True):
|
|
||||||
return self.kwargs
|
|
||||||
|
|
||||||
|
|
||||||
class ExplicitLossMinimisation(AggregativeQuantifier):
|
class ExplicitLossMinimisationBinary(AggregativeQuantifier):
    """Binary quantifier that classifies with SVMperf (trained for `loss`) and counts the predictions."""

    def __init__(self, svmperf_base, loss, **kwargs):
        """
        :param svmperf_base: passed as first argument to `SVMperf`
        :param loss: passed as the `loss` argument to `SVMperf`
        :param kwargs: additional keyword arguments passed to `SVMperf`
        """
        self.svmperf_base = svmperf_base
        self.loss = loss
        self.kwargs = kwargs

    def fit(self, data: LabelledCollection, fit_learner=True, *args):
        """Train a fresh `SVMperf` on the binary collection `data`.

        :param data: the binary training collection
        :param fit_learner: must be True; the learner is always trained here
        :return: self
        """
        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
        assert fit_learner, 'the method requires that fit_learner=True'
        # the learner is created here (not in __init__) so that each fit starts from scratch
        self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
        return self

    def quantify(self, X, y=None):
        """Return the class prevalences computed from the labels predicted for `X`."""
        predictions = self.learner.predict(X)
        # the debugging print that used to live here has been removed
        return F.prevalence_from_labels(predictions, self.learner.n_classes_)

    def classify(self, X, y=None):
        """Return the labels predicted by the underlying learner for `X`."""
        return self.learner.predict(X)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class SVMQ(ExplicitLossMinimisation):
    """Explicit Loss Minimisation preconfigured with the 'q' loss."""

    def __init__(self, svmperf_base, **kwargs):
        super().__init__(svmperf_base, loss='q', **kwargs)
|
||||||
|
@ -349,3 +429,12 @@ class SVMRAE(ExplicitLossMinimisation):
|
||||||
def __init__(self, svmperf_base, **kwargs):
|
def __init__(self, svmperf_base, **kwargs):
|
||||||
super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
|
super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# Short aliases for the quantification methods
CC = ClassifyAndCount
|
||||||
|
ACC = AdjustedClassifyAndCount
|
||||||
|
PCC = ProbabilisticClassifyAndCount
|
||||||
|
PACC = ProbabilisticAdjustedClassifyAndCount
|
||||||
|
ELM = ExplicitLossMinimisation
|
||||||
|
EMQ = ExpectationMaximizationQuantifier
|
||||||
|
HDy = HellingerDistanceY
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from abc import ABCMeta, abstractmethod
|
from abc import ABCMeta, abstractmethod
|
||||||
import quapy as qp
|
|
||||||
|
|
||||||
|
|
||||||
# Base Quantifier abstract class
|
# Base Quantifier abstract class
|
||||||
|
@ -7,10 +6,10 @@ import quapy as qp
|
||||||
class BaseQuantifier(metaclass=ABCMeta):
|
class BaseQuantifier(metaclass=ABCMeta):
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def fit(self, data: qp.LabelledCollection, *args): ...
|
def fit(self, data, *args): ...
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def quantify(self, documents, *args): ...
|
def quantify(self, instances, *args): ...
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def set_params(self, **parameters): ...
|
def set_params(self, **parameters): ...
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
from . import util
|
|
@ -0,0 +1,35 @@
|
||||||
|
import itertools
|
||||||
|
import multiprocessing
|
||||||
|
from joblib import Parallel, delayed
|
||||||
|
import contextlib
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def get_parallel_slices(n_tasks, n_jobs=-1):
    """Split the index range [0, n_tasks) into `n_jobs` contiguous, balanced slices.

    The slices are returned in order and cover every index exactly once. Their
    lengths differ by at most one: the first `n_tasks % n_jobs` slices get one
    extra element, instead of the last slice absorbing the whole remainder.

    :param n_tasks: total number of tasks to distribute
    :param n_jobs: number of jobs; follows the joblib convention, i.e., None means 1
        and a negative value means `cpu_count + 1 + n_jobs` (so -1 is all CPUs)
    :return: a list with one `slice` per job (possibly empty if n_tasks < n_jobs)
    :raises ValueError: if `n_jobs` is 0
    """
    if n_jobs is None:
        n_jobs = 1
    elif n_jobs < 0:
        n_jobs = max(multiprocessing.cpu_count() + 1 + n_jobs, 1)
    if n_jobs == 0:
        raise ValueError('n_jobs == 0 has no meaning')

    batch, remainder = divmod(n_tasks, n_jobs)
    slices = []
    start = 0
    for job in range(n_jobs):
        stop = start + batch + (1 if job < remainder else 0)
        slices.append(slice(start, stop))
        start = stop
    return slices
|
||||||
|
|
||||||
|
|
||||||
|
def parallelize(func, args, n_jobs):
    """Apply `func` to contiguous chunks of `args` in parallel and concatenate the results.

    `args` is converted to a NumPy array and cut according to `get_parallel_slices`;
    `func` receives one chunk per call and must return an iterable.

    :param func: callable taking a chunk (array slice) of `args`
    :param args: the collection of arguments to distribute
    :param n_jobs: number of jobs, used both for slicing and for joblib's `Parallel`
    :return: a flat list with the items returned for every chunk, in chunk order
    """
    arg_array = np.asarray(args)
    chunks = [arg_array[chunk] for chunk in get_parallel_slices(len(arg_array), n_jobs)]
    per_job_results = Parallel(n_jobs=n_jobs)(delayed(func)(chunk) for chunk in chunks)
    flattened = []
    for job_result in per_job_results:
        flattened.extend(job_result)
    return flattened
|
||||||
|
|
||||||
|
|
||||||
|
@contextlib.contextmanager
def temp_seed(seed):
    """Context manager that seeds NumPy's global random generator temporarily.

    The global random state found on entry is put back on exit, also when the
    body of the `with` block raises.

    :param seed: the seed passed to `np.random.seed` for the duration of the block
    """
    saved_state = np.random.get_state()
    np.random.seed(seed)
    try:
        yield
    finally:
        # put the generator back exactly where the caller had left it
        np.random.set_state(saved_state)
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.svm import LinearSVC
|
||||||
|
import quapy as qp
|
||||||
|
import quapy.functional as F
|
||||||
|
|
||||||
|
SAMPLE_SIZE = 500
binary = False

# pick the dataset location and the reader that matches its format
if binary:
    # a textual binary dataset, to be turned into a tfidf bag of words
    train_path = './datasets/reviews/kindle/train.txt'
    test_path = './datasets/reviews/kindle/test.txt'
    reader = qp.reader.from_text
else:
    # a ternary dataset already stored as a sparse matrix
    train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
    test_path = './datasets/twitter/test/sst.test.feature.txt'
    reader = qp.reader.from_sparse

dataset = qp.Dataset.load(train_path, test_path, reader)
if binary:
    qp.preprocessing.text2tfidf(dataset, inplace=True)
    qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)

# training a quantifier; alternative methods that can be swapped in:
#   qp.method.aggregative.AdjustedClassifyAndCount(learner)
#   qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
#   qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
#   qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
learner = LogisticRegression()
model = qp.method.aggregative.ClassifyAndCount(learner)
model.fit(dataset.training)

# estimating class prevalences
prevalences_estim = model.quantify(dataset.test.instances)
prevalences_true = dataset.test.prevalence()

# evaluation (one single prediction)
mae_score = qp.error.mae(prevalences_true, prevalences_estim)

print(f'method {model.__class__.__name__}')

print('Evaluation in test (1 eval)')
print(f'true prevalence {F.strprev(prevalences_true)}')
print(f'estim prevalence {F.strprev(prevalences_estim)}')
print(f'mae={mae_score:.3f}')

# evaluation over many samples drawn at different prevalences
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE)

qp.error.SAMPLE_SIZE = SAMPLE_SIZE
print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
for error_fn in qp.error.QUANTIFICATION_ERROR:
    score = error_fn(true_prev, estim_prev)
    print(f'{error_fn.__name__}={score:.5f}')
|
Loading…
Reference in New Issue