
Evaluation by artificial prevalence sampling added. New methods added. New util functions added to quapy.functional and quapy.utils.

Alejandro Moreo Fernandez 2020-12-10 19:04:33 +01:00
parent a882424eeb
commit 9bc3a9f28a
17 changed files with 444 additions and 110 deletions

README.md

@ -1,3 +1,3 @@
 # QuaPy
-A Python framework for Quantification
+A Quantification framework written in Python.

TODO.txt

@ -1,3 +1,8 @@
 Documentation with sphinx
-The parallel training in svmperf seems not to work
-Add "prepare svmperf for quantification" script
+Add evaluation - artificial sampling
+Add quantification_report (akin to classification_report from sklearn)
+Add optimization - artificial sampling
+Add prediction - artificial sampling
+Add readers for typical datasets used in Quantification
+Add NAE, NRAE
+Add "measures for evaluating ordinal"?

quapy/__init__.py

@ -1,6 +1,5 @@
-from .dataset import *
+from .data import *
 from . import functional
 from . import method
 from . import error
+from . import evaluation

quapy/classification/svmperf.py

@ -20,12 +20,9 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         self.verbose = verbose
         self.loss = loss

-    def set_c(self, C):
-        self.param_C = '-c ' + str(C)
-
     def set_params(self, **parameters):
         assert list(parameters.keys()) == ['C'], 'currently, only the C parameter is supported'
-        self.set_c(parameters['C'])
+        self.C = parameters['C']

     def fit(self, X, y):
         assert self.loss in SVMperf.valid_losses, \
@ -33,8 +30,8 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         self.svmperf_learn = join(self.svmperf_base, 'svm_perf_learn')
         self.svmperf_classify = join(self.svmperf_base, 'svm_perf_classify')
-        self.loss_cmd = '-l ' + str(self.valid_losses[self.loss])
-        self.set_c(self.C)
+        self.loss_cmd = '-w 3 -l ' + str(self.valid_losses[self.loss])
+        self.c_cmd = '-c ' + str(self.C)

         self.classes_ = sorted(np.unique(y))
         self.n_classes_ = len(self.classes_)
@ -49,7 +46,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         dump_svmlight_file(X, y, traindat, zero_based=False)

-        cmd = ' '.join([self.svmperf_learn, self.param_C, self.loss_cmd, traindat, self.model])
+        cmd = ' '.join([self.svmperf_learn, self.c_cmd, self.loss_cmd, traindat, self.model])
         if self.verbose:
             print('[Running]', cmd)
         p = subprocess.run(cmd.split(), stdout=PIPE, stderr=STDOUT)
@ -60,7 +57,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         return self

-    def predict(self, X, y=None):
+    def predict(self, X):
         confidence_scores = self.decision_function(X)
         predictions = (confidence_scores > 0) * 1
         return predictions
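With this refactoring, C is a plain attribute that is only translated into the -c flag at fit time, and the -w 3 option is now always passed along with the loss flag. A minimal usage sketch, not part of this commit (the ./svm_perf path and the toy data are assumptions, and a compiled svm_perf_learn/svm_perf_classify pair must exist under that path):

from sklearn.datasets import make_classification
from quapy.classification.svmperf import SVMperf

X, y = make_classification(n_samples=200, n_features=20, random_state=0)

svm = SVMperf(svmperf_base='./svm_perf', loss='q')  # 'q' is one of SVMperf.valid_losses (cf. SVMQ below)
svm.set_params(C=10)          # only C is tunable, as enforced by the assert in set_params
svm.fit(X, y)                 # dumps the data in svmlight format and shells out to svm_perf_learn
predictions = svm.predict(X)  # 0/1 predictions, thresholding the decision scores at 0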

quapy/data/base.py

@ -22,12 +22,6 @@ class LabelledCollection:
     def load(cls, path:str, loader_func:callable):
         return LabelledCollection(*loader_func(path))

-    @classmethod
-    def load_dataset(cls, train_path, test_path):
-        training = cls.load(train_path)
-        test = cls.load(test_path)
-        return Dataset(training, test)
-
     def __len__(self):
         return self.instances.shape[0]
@ -43,13 +37,13 @@ class LabelledCollection:
     @property
     def binary(self):
-        return self.n_classes==2
+        return self.n_classes == 2

     def sampling_index(self, size, *prevs, shuffle=True):
         if len(prevs) == self.n_classes-1:
             prevs = prevs + (1-sum(prevs),)
         assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
-        assert sum(prevs) == 1, f'prevalences ({prevs}) out of range (sum={sum(prevs)})'
+        assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'

         taken = 0
         indexes_sample = []
@ -93,6 +87,11 @@ class LabelledCollection:
         for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
             yield self.sampling(sample_size, *prevs)

+    def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
+        dimensions=self.n_classes
+        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
+            yield self.sampling_index(sample_size, *prevs)
+
     def __add__(self, other):
         if issparse(self.instances) and issparse(other.documents):
             docs = vstack([self.instances, other.documents])
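The new artificial_sampling_index_generator mirrors artificial_sampling_generator but yields only the sampling indices, so the same samples can be re-materialized later through sampling_from_index (this is what the new quapy/evaluation.py below relies on). A small sketch of the intended use, not part of this commit, on made-up binary data:

import numpy as np
from quapy.data.base import LabelledCollection  # module path as per this commit's layout

data = LabelledCollection(np.random.rand(1000, 5), np.random.randint(0, 2, size=1000))

# for binary data, n_prevalences=11 yields the prevalences 0.0, 0.1, ..., 1.0, i.e., 11 index arrays
for index in data.artificial_sampling_index_generator(sample_size=50, n_prevalences=11, repeats=1):
    sample = data.sampling_from_index(index)  # re-materializes the sample from its indices
    print(sample.prevalence())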

quapy/data/preprocessing.py

@ -1,9 +1,10 @@
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from dataset.base import Dataset
+from data.base import Dataset
 from scipy.sparse import spmatrix
 from utils.util import parallelize
 from .base import LabelledCollection
+from tqdm import tqdm


 def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
@ -78,8 +79,8 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
     :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
     consisting of lists of integer values representing indices.
     """
-    __check_type(dataset.training.instances, list, str)
-    __check_type(dataset.test.instances, list, str)
+    __check_type(dataset.training.instances, np.ndarray, str)
+    __check_type(dataset.test.instances, np.ndarray, str)

     indexer = IndexTransformer(min_df=min_df, **kwargs)
     training_index = indexer.fit_transform(dataset.training.instances)
@ -105,7 +106,6 @@ def __check_type(container, container_type=None, element_type=None):
         f'unexpected type of element (expected {container_type}, found {type(container)})'

-
 class IndexTransformer:

     def __init__(self, **kwargs):
@ -140,7 +140,7 @@ class IndexTransformer:
         return self.fit(X).transform(X, n_jobs=n_jobs)

     def vocabulary_size(self):
-        return len(self.vocabulary_) + 1  # the reserved unk token
+        return len(self.vocabulary_)

     def add_word(self, word):
         if word in self.vocabulary_:

quapy/error.py

@ -1,5 +1,8 @@
 from sklearn.metrics import f1_score
-from settings import SAMPLE_SIZE
+import numpy as np
+
+SAMPLE_SIZE = None


 def f1e(y_true, y_pred):
@ -7,8 +10,7 @@ def f1e(y_true, y_pred):
 def acce(y_true, y_pred):
-    acc = (y_true == y_pred).mean()
-    return 1. - acc
+    return 1. - (y_true == y_pred).mean()


 def mae(prevs, prevs_hat):
@ -20,11 +22,40 @@ def ae(p, p_hat):
     return abs(p_hat-p).mean(axis=-1)


-def mrae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)):
+def mse(prevs, prevs_hat):
+    return se(prevs, prevs_hat).mean()
+
+
+def se(p, p_hat):
+    return ((p_hat-p)**2).mean(axis=-1)
+
+
+def mkld(prevs, prevs_hat):
+    return kld(prevs, prevs_hat).mean()
+
+
+def kld(p, p_hat, eps=None):
+    eps = __check_eps(eps)
+    sp = p+eps
+    sp_hat = p_hat + eps
+    return (sp*np.log(sp/sp_hat)).sum(axis=-1)
+
+
+def mnkld(prevs, prevs_hat):
+    return nkld(prevs, prevs_hat).mean()
+
+
+def nkld(p, p_hat, eps=None):
+    ekld = np.exp(kld(p, p_hat, eps))
+    return 2. * ekld / (1 + ekld) - 1.
+
+
+def mrae(p, p_hat, eps=None):
     return rae(p, p_hat, eps).mean()


-def rae(p, p_hat, eps=1./(2. * SAMPLE_SIZE)):
+def rae(p, p_hat, eps=None):
+    eps = __check_eps(eps)
     p = smooth(p, eps)
     p_hat = smooth(p_hat, eps)
     return (abs(p-p_hat)/p).mean(axis=-1)
@ -35,8 +66,17 @@ def smooth(p, eps):
     return (p+eps)/(eps*n_classes + 1)


+def __check_eps(eps):
+    if eps is None:
+        if SAMPLE_SIZE is None:
+            raise ValueError('eps was not defined, and qp.error.SAMPLE_SIZE was not set')
+        else:
+            eps = 1. / (2. * SAMPLE_SIZE)
+    return eps
+
+
 CLASSIFICATION_ERROR = {f1e, acce}
-QUANTIFICATION_ERROR = {mae, mrae}
+QUANTIFICATION_ERROR = {mae, mrae, mse, mkld, mnkld}

 f1_error = f1e
 acc_error = acce
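The eps used by the smoothed measures (rae, kld and their aggregates) is thus no longer hard-wired to an imported SAMPLE_SIZE: it can be passed explicitly, or derived as 1/(2*SAMPLE_SIZE) once qp.error.SAMPLE_SIZE has been set, which is what test.py does at the end of this commit. A quick sketch of both call styles, not part of the commit:

import numpy as np
import quapy as qp

p_true = np.asarray([0.8, 0.2])
p_estim = np.asarray([0.6, 0.4])

print(qp.error.mrae(p_true, p_estim, eps=0.001))  # explicit eps, no global setting needed

qp.error.SAMPLE_SIZE = 500             # without this, the next call raises ValueError in __check_eps
print(qp.error.mrae(p_true, p_estim))  # eps defaults to 1/(2*500)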

quapy/evaluation.py Normal file

@ -0,0 +1,53 @@
from data import LabelledCollection
from method.base import BaseQuantifier
from utils.util import temp_seed
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm


def artificial_sampling_prediction(
        model: BaseQuantifier,
        test: LabelledCollection,
        sample_size,
        prevalence_points=21,
        point_repetitions=1,
        n_jobs=-1,
        random_seed=42):
    """
    Performs the predictions for all samples generated according to the artificial sampling protocol.
    :param model: the model in charge of generating the class prevalence estimations
    :param test: the test set on which to perform artificial sampling
    :param sample_size: the size of the samples
    :param prevalence_points: the number of different prevalences to sample
    :param point_repetitions: the number of repetitions for each prevalence
    :param n_jobs: number of jobs to be run in parallel
    :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
    any other random process.
    :return: two ndarrays of shape [m,n], with m the number of samples (prevalence_points*point_repetitions) and
    n the number of classes. The first one contains the true prevalences for the generated samples, while the
    second one contains the prevalence estimations
    """

    with temp_seed(random_seed):
        indexes = list(test.artificial_sampling_index_generator(sample_size, prevalence_points, point_repetitions))

    def _predict_prevalences(index):
        sample = test.sampling_from_index(index)
        true_prevalence = sample.prevalence()
        estim_prevalence = model.quantify(sample.instances)
        return true_prevalence, estim_prevalence

    results = Parallel(n_jobs=n_jobs)(
        delayed(_predict_prevalences)(index) for index in tqdm(indexes)
    )

    true_prevalences, estim_prevalences = zip(*results)
    true_prevalences = np.asarray(true_prevalences)
    estim_prevalences = np.asarray(estim_prevalences)

    return true_prevalences, estim_prevalences

quapy/functional.py

@ -15,6 +15,26 @@ def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, retur
     return prevs

+
+def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
+    """
+    Produces uniformly separated values of prevalence. By default, produces an array of 21 prevalences, with step
+    0.05 and with the limits smoothed, i.e.:
+    [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99]
+    :param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21)
+    :param repeat: number of times each prevalence is to be repeated (defaults to 1)
+    :param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1
+    :return: an array of uniformly separated prevalence values
+    """
+    p = np.linspace(0., 1., num=n_prevalences, endpoint=True)
+    p[0] += smooth_limits_epsilon
+    p[-1] -= smooth_limits_epsilon
+    if p[0] > p[1]:
+        raise ValueError(f'the smoothing in the limits is greater than the prevalence step')
+    if repeat > 1:
+        p = np.repeat(p, repeat)
+    return p
+
+
 def prevalence_from_labels(labels, n_classes):
     unique, counts = np.unique(labels, return_counts=True)
     by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
@ -47,3 +67,54 @@ def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True):
     return adjusted

+
+def normalize_prevalence(prevalences):
+    assert prevalences.ndim==1, 'unexpected shape'
+    accum = prevalences.sum()
+    if accum > 0:
+        return prevalences / accum
+    else:
+        # if all classifiers are trivial rejectors
+        return np.ones_like(prevalences) / prevalences.size
+
+
+def num_prevalence_combinations(nclasses:int, nprevpoints:int, nrepeats:int):
+    """
+    Computes the number of prevalence combinations in the nclasses-dimensional simplex if nprevpoints equally
+    distant prevalences are generated and nrepeats repetitions are requested
+    :param nclasses: number of classes
+    :param nprevpoints: number of prevalence points
+    :param nrepeats: number of repetitions for each prevalence combination
+    :return: the number of possible combinations. For example, if nclasses=2, nprevpoints=5, nrepeats=1, then the
+    number of possible combinations is 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0]
+    """
+    __cache={}
+    def __f(nc,np):
+        if (nc,np) in __cache:
+            return __cache[(nc,np)]
+        if nc==1:
+            return 1
+        else:
+            x = sum([__f(nc-1, np-i) for i in range(np)])
+            __cache[(nc,np)] = x
+            return x
+    return __f(nclasses, nprevpoints) * nrepeats
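The recursion in __f counts, for nc classes and np prevalence points, how many points of the grid lie on the simplex. Two hand-checkable values, as a sanity sketch that is not part of the commit (the import path is assumed):

from quapy.functional import num_prevalence_combinations

# binary case: the count is simply nprevpoints, as in the docstring example
assert num_prevalence_combinations(nclasses=2, nprevpoints=5, nrepeats=1) == 5
# ternary case with step 0.25: compositions of 4 quarters into 3 bins, i.e., C(6,2) = 15
assert num_prevalence_combinations(nclasses=3, nprevpoints=5, nrepeats=1) == 15
# repetitions scale the count linearly
assert num_prevalence_combinations(nclasses=3, nprevpoints=5, nrepeats=2) == 30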
+def get_nprevpoints_approximation(nclasses, nrepeats, combinations_budget):
+    """
+    Searches for the largest number of (equidistant) prevalence points to define for each of the nclasses classes
+    so that the number of valid prevalences generated as combinations of prevalence points (points in a
+    nclasses-dimensional simplex) does not exceed combinations_budget.
+    :param nclasses: number of classes
+    :param nrepeats: number of repetitions for each prevalence combination
+    :param combinations_budget: maximum number of combinations allowed
+    :return: the largest number of prevalence points that generates fewer than combinations_budget valid prevalences
+    """
+    assert nclasses>0 and nrepeats>0 and combinations_budget>0, 'parameters must be positive integers'
+    nprevpoints = 1
+    while True:
+        combinations = num_prevalence_combinations(nclasses, nprevpoints, nrepeats)
+        if combinations > combinations_budget:
+            return nprevpoints-1
+        else:
+            nprevpoints+=1
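get_nprevpoints_approximation inverts that count by linear search: it returns the finest grid whose number of combinations still fits the budget. For instance, under the same import assumption:

from quapy.functional import get_nprevpoints_approximation, num_prevalence_combinations

# with 3 classes, p points yield p*(p+1)/2 combinations; 99 points give 4950 <= 5000, 100 would give 5050
p = get_nprevpoints_approximation(nclasses=3, nrepeats=1, combinations_budget=5000)
assert p == 99 and num_prevalence_combinations(3, p, 1) <= 5000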

quapy/method/__init__.py

@ -1,5 +1,6 @@
+from . import base
 from . import aggregative as agg
-from . import non_aggregative as nagg
+from . import non_aggregative
AGGREGATIVE_METHODS = { AGGREGATIVE_METHODS = {
@ -9,22 +10,14 @@ AGGREGATIVE_METHODS = {
     agg.ProbabilisticAdjustedClassifyAndCount,
     agg.ExplicitLossMinimisation,
     agg.ExpectationMaximizationQuantifier,
+    agg.HellingerDistanceY
 }

 NON_AGGREGATIVE_METHODS = {
-    nagg.MaximumLikelihoodPrevalenceEstimation
+    non_aggregative.MaximumLikelihoodPrevalenceEstimation
 }

 QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS

-
-# common aliases
-CC = agg.ClassifyAndCount
-ACC = agg.AdjustedClassifyAndCount
-PCC = agg.ProbabilisticClassifyAndCount
-PACC = agg.ProbabilisticAdjustedClassifyAndCount
-ELM = agg.ExplicitLossMinimisation
-EMQ = agg.ExpectationMaximizationQuantifier
-MLPE = nagg.MaximumLikelihoodPrevalenceEstimation

quapy/method/aggregative.py

@ -1,12 +1,14 @@
 import numpy as np
-from .base import *
-from ..error import mae
+from copy import deepcopy
 import functional as F
-from ..classification.svmperf import SVMperf
-from ..dataset import LabelledCollection
+import error
+from method.base import BaseQuantifier
+from quapy.classification.svmperf import SVMperf
+from quapy.data import LabelledCollection
 from sklearn.metrics import confusion_matrix
 from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
+from abc import abstractmethod


 # Abstract classes
@ -21,8 +23,16 @@ class AggregativeQuantifier(BaseQuantifier):
     @abstractmethod
     def fit(self, data: LabelledCollection, fit_learner=True, *args): ...

-    def classify(self, documents):
-        return self.learner.predict(documents)
+    @property
+    def learner(self):
+        return self.learner_
+
+    @learner.setter
+    def learner(self, value):
+        self.learner_ = value
+
+    def classify(self, instances):
+        return self.learner.predict(instances)

     def get_params(self, deep=True):
         return self.learner.get_params()
@ -67,12 +77,12 @@ def training_helper(learner,
     Training procedure common to all Aggregative Quantifiers.
     :param learner: the learner to be fit
     :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
-    :param fit_learner: whether or not to fit the learner
+    :param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
     :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
     learner is not probabilistic, then a CalibratedCV instance of it is trained)
-    :param train_val_split: if specified, indicates the proportion of training documents on which to fit the learner
+    :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
     :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
-    or None otherwise)
+    or None otherwise) to be used as a validation set for any subsequent parameter fitting
     """
     if fit_learner:
         if ensure_probabilistic:
@ -118,8 +128,8 @@ class ClassifyAndCount(AggregativeQuantifier):
         self.learner, _ = training_helper(self.learner, data, fit_learner)
         return self

-    def quantify(self, documents, *args):
-        classification = self.classify(documents)  # classify
+    def quantify(self, instances, *args):
+        classification = self.classify(instances)  # classify
         return F.prevalence_from_labels(classification, self.n_classes)  # & count
@ -138,8 +148,8 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
         self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
         return self

-    def quantify(self, documents, *args):
-        prevs_estim = self.cc.quantify(documents)
+    def quantify(self, instances, *args):
+        prevs_estim = self.cc.quantify(instances)
         # solve for the linear system Ax = B with A=Pte_cond_estim and B = prevs_estim
         A = self.Pte_cond_estim_
         B = prevs_estim
@ -163,8 +173,8 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
         self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
         return self

-    def quantify(self, documents, *args):
-        posteriors = self.soft_classify(documents)  # classify
+    def quantify(self, instances, *args):
+        posteriors = self.soft_classify(instances)  # classify
         prevalences = F.prevalence_from_probabilities(posteriors, binarize=False)  # & count
         return prevalences
@ -186,8 +196,8 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeQuantifier):
         self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
         return self

-    def quantify(self, documents, *args):
-        prevs_estim = self.pcc.quantify(documents)
+    def quantify(self, instances, *args):
+        prevs_estim = self.pcc.quantify(instances)
         A = self.Pte_cond_estim_
         B = prevs_estim
         try:
@ -237,7 +247,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
         # M-step: qs_pos is Ps+1(y=+1)
         qs = ps.mean(axis=0)

-        if qs_prev_ is not None and mae(qs, qs_prev_) < epsilon and s>10:
+        if qs_prev_ is not None and error.mae(qs, qs_prev_) < epsilon and s>10:
             converged = True

         qs_prev_ = qs
@ -252,79 +262,149 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
     return qs


-# todo: from here
-def train_task(c, learners, data):
-    learners[c].fit(data.documents, data.labels == c)
+class HellingerDistanceY(AggregativeProbabilisticQuantifier):
+    """
+    Implementation of the method based on the Hellinger Distance y (HDy) proposed by
+    González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution
+    estimation based on the Hellinger distance. Information Sciences, 218:146-164.
+    """
+
+    def __init__(self, learner):
+        self.learner = learner
+
+    def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
+        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
+                            f'Use the class OneVsAll to enable {self.__class__.__name__} to work on single-label data.'
+        self.learner, validation = training_helper(
+            self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
+        Px = self.soft_classify(validation.instances)
+        self.Pxy1 = Px[validation.labels == 1]
+        self.Pxy0 = Px[validation.labels == 0]
+        return self
+
+    def quantify(self, instances, *args):
+        # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
+        # and the final estimated a priori probability was taken as the median of these 11 estimates."
+        # (González-Castro, et al., 2013).
+        Px = self.soft_classify(instances)
+
+        prev_estimations = []
+        for bins in np.linspace(10, 110, 11, dtype=int):  # [10, 20, 30, ..., 100, 110]
+            Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
+            Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
+
+            Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True)
+
+            prev_selected, min_dist = None, None
+            for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0):
+                Px_train = prev*Pxy1_density + (1 - prev)*Pxy0_density
+                hdy = HellingerDistanceY.HellingerDistance(Px_train, Px_test)
+                if prev_selected is None or hdy < min_dist:
+                    prev_selected, min_dist = prev, hdy
+            prev_estimations.append(prev_selected)
+
+        pos_class_prev = np.median(prev_estimations)
+        return np.asarray([1-pos_class_prev, pos_class_prev])
+
+    @classmethod
+    def HellingerDistance(cls, P, Q):
+        return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
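A minimal end-to-end sketch of HDy on synthetic binary data, not part of the commit (the Gaussian blobs, the logistic learner and the default 60/40 train/validation split are all assumptions):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.aggregative import HellingerDistanceY

np.random.seed(0)
X = np.vstack([np.random.randn(500, 2) + 1, np.random.randn(500, 2) - 1])  # two Gaussian blobs
y = np.asarray([1] * 500 + [0] * 500)
train = LabelledCollection(X, y)

hdy = HellingerDistanceY(LogisticRegression())
hdy.fit(train)  # fits the classifier on 60% of the data, keeping 40% for the class-conditional histograms
test = train.sampling(200, 0.3, 0.7)  # a shifted sample with prevalence [0.3, 0.7]
print(test.prevalence(), hdy.quantify(test.instances))  # the estimate should approximate the true prevalence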
-def binary_quant_task(c, learners, X):
-    predictions_ci = learners[c].predict(X)
-    return predictions_ci.mean()  # since the predictions array is binary
+class OneVsAll(AggregativeQuantifier):
+    """
+    Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
+    quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
+    """
+
+    def __init__(self, binary_method, n_jobs=-1):
+        self.binary_method = binary_method
+        self.n_jobs = n_jobs
+
+    def fit(self, data: LabelledCollection, **kwargs):
+        assert not data.binary, f'{self.__class__.__name__} expects non-binary data'
+        assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
+        self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
+        Parallel(n_jobs=self.n_jobs, backend='threading')(
+            delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
+        )
+        return self
+
+    def quantify(self, X, *args):
+        prevalences = np.asarray(
+            Parallel(n_jobs=self.n_jobs, backend='threading')(
+                delayed(self._delayed_binary_predict)(c, self.class_method, X) for c in self.classes
+            )
+        )
+        return F.normalize_prevalence(prevalences)
+
+    @property
+    def classes(self):
+        return sorted(self.class_method.keys())
+
+    def set_params(self, **parameters):
+        self.binary_method.set_params(**parameters)
+
+    def get_params(self, deep=True):
+        return self.binary_method.get_params()
+
+    def _delayed_binary_predict(self, c, learners, X):
+        return learners[c].classify(X).mean()  # the mean is the estimation for the positive class prevalence
+
+    def _delayed_binary_fit(self, c, learners, data, **kwargs):
+        bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
+        learners[c].fit(bindata, **kwargs)
-class OneVsAllELM(AggregativeQuantifier):
+class ExplicitLossMinimisation(AggregativeQuantifier):
+    """
+    A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
+    quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
+    This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment
+    analysis. Social Network Analysis and Mining 6(19), 1-22 (2016)
+    """

-    def __init__(self, svmperf_base, loss, n_jobs=-1, **kwargs):
+    def __init__(self, svmperf_base, loss, **kwargs):
         self.svmperf_base = svmperf_base
         self.loss = loss
-        self.n_jobs = n_jobs
         self.kwargs = kwargs

     def fit(self, data: LabelledCollection, fit_learner=True, *args):
         assert fit_learner, 'the method requires that fit_learner=True'
+        self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
+        if not data.binary:
+            self.learner = OneVsAll(self.learner, n_jobs=-1)
+        return self.learner.fit(data, *args)

-        self.learners = {c: SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs) for c in data.classes_}
-        Parallel(n_jobs=self.n_jobs, backend='threading')(
-            delayed(train_task)(c, self.learners, data) for c in self.learners.keys()
-        )
-        return self
-
-    def quantify(self, X, y=None):
-        prevalences = np.asarray(
-            Parallel(n_jobs=self.n_jobs, backend='threading')(
-                delayed(binary_quant_task)(c, self.learners, X) for c in self.learners.keys()
-            )
-        )
-        prevalences /= prevalences.sum()
-        return prevalences
-
-    @property
-    def classes(self):
-        return sorted(self.learners.keys())
-
-    def preclassify_collection(self, data: LabelledCollection):
-        classifications = []
-        for class_ in data.classes_:
-            classifications.append(self.learners[class_].predict(data.instances))
-        classifications = np.vstack(classifications).T
-        precomputed = LabelledCollection(classifications, data.labels)
-        return precomputed
-
-    def set_params(self, **parameters):
-        self.kwargs=parameters
-
-    def get_params(self, deep=True):
-        return self.kwargs
+    def quantify(self, instances, *args):
+        return self.learner.quantify(instances, *args)


-class ExplicitLossMinimisation(AggregativeQuantifier):
+class ExplicitLossMinimisationBinary(AggregativeQuantifier):

     def __init__(self, svmperf_base, loss, **kwargs):
-        self.learner = SVMperf(svmperf_base, loss=loss, **kwargs)
+        self.svmperf_base = svmperf_base
+        self.loss = loss
+        self.kwargs = kwargs

     def fit(self, data: LabelledCollection, fit_learner=True, *args):
+        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
         assert fit_learner, 'the method requires that fit_learner=True'
-        self.learner.fit(data.instances, data.labels)
+        self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
         return self

     def quantify(self, X, y=None):
         predictions = self.learner.predict(X)
-        return F.prevalence_from_labels(predictions, self.learner.n_classes_)
+        prev = F.prevalence_from_labels(predictions, self.learner.n_classes_)
+        print('binary: ', prev)
+        return prev

     def classify(self, X, y=None):
         return self.learner.predict(X)


 class SVMQ(ExplicitLossMinimisation):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
@ -349,3 +429,12 @@ class SVMRAE(ExplicitLossMinimisation):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)

+
+CC = ClassifyAndCount
+ACC = AdjustedClassifyAndCount
+PCC = ProbabilisticClassifyAndCount
+PACC = ProbabilisticAdjustedClassifyAndCount
+ELM = ExplicitLossMinimisation
+EMQ = ExpectationMaximizationQuantifier
+HDy = HellingerDistanceY
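Since HellingerDistanceY and ExplicitLossMinimisationBinary are binary-only, OneVsAll is the generic adapter for single-label multiclass data: one independent binary quantifier per class, with the per-class outputs l1-normalized by F.normalize_prevalence. A sketch on synthetic ternary data, not part of the commit (data and learner are assumptions):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.aggregative import OneVsAll, HellingerDistanceY

np.random.seed(0)
X = np.vstack([np.random.randn(300, 2) + 2 * c for c in range(3)])  # three shifted blobs
y = np.repeat([0, 1, 2], 300)
train = LabelledCollection(X, y)

ova = OneVsAll(HellingerDistanceY(LogisticRegression()), n_jobs=1)
ova.fit(train)                        # one binary HDy per class, fit in parallel threads
print(ova.quantify(train.instances))  # three per-class estimates, normalized to sum to 1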

quapy/method/base.py

@ -1,5 +1,4 @@
 from abc import ABCMeta, abstractmethod
-import quapy as qp


 # Base Quantifier abstract class
@ -7,10 +6,10 @@ import quapy as qp
 class BaseQuantifier(metaclass=ABCMeta):

     @abstractmethod
-    def fit(self, data: qp.LabelledCollection, *args): ...
+    def fit(self, data, *args): ...

     @abstractmethod
-    def quantify(self, documents, *args): ...
+    def quantify(self, instances, *args): ...

     @abstractmethod
     def set_params(self, **parameters): ...

quapy/utils/__init__.py Normal file

@ -0,0 +1 @@
from . import util

quapy/utils/util.py Normal file

@ -0,0 +1,35 @@
import itertools
import multiprocessing
from joblib import Parallel, delayed
import contextlib
import numpy as np


def get_parallel_slices(n_tasks, n_jobs=-1):
    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()
    batch = int(n_tasks / n_jobs)
    remainder = n_tasks % n_jobs
    return [slice(job * batch, (job + 1) * batch + (remainder if job == n_jobs - 1 else 0)) for job in
            range(n_jobs)]


def parallelize(func, args, n_jobs):
    args = np.asarray(args)
    slices = get_parallel_slices(len(args), n_jobs)
    results = Parallel(n_jobs=n_jobs)(
        delayed(func)(args[slice_i]) for slice_i in slices
    )
    return list(itertools.chain.from_iterable(results))


@contextlib.contextmanager
def temp_seed(seed):
    state = np.random.get_state()
    np.random.seed(seed)
    try:
        yield
    finally:
        np.random.set_state(state)
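Both helpers are small but load-bearing: parallelize hands each worker a contiguous slice of the argument list (not a single item), and temp_seed makes a block of random sampling reproducible while restoring the global numpy RNG state on exit (this is how evaluation.py localizes its random_seed). A usage sketch, not part of the commit:

import numpy as np
from quapy.utils.util import parallelize, temp_seed

def square_all(chunk):  # each call receives a slice of the args array
    return [int(x) ** 2 for x in chunk]

print(parallelize(square_all, list(range(10)), n_jobs=2))  # [0, 1, 4, ..., 81]

with temp_seed(42):
    a = np.random.rand(3)
with temp_seed(42):
    b = np.random.rand(3)  # same seed, same draws
assert np.allclose(a, b)   # the pre-existing RNG state is untouched outside the blocks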

test.py Normal file

@ -0,0 +1,53 @@
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F


SAMPLE_SIZE = 500
binary = False

if binary:
    # load a textual binary dataset and create a tfidf bag of words
    train_path = './datasets/reviews/kindle/train.txt'
    test_path = './datasets/reviews/kindle/test.txt'
    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
    qp.preprocessing.text2tfidf(dataset, inplace=True)
    qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
else:
    # load a sparse matrix ternary dataset
    train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
    test_path = './datasets/twitter/test/sst.test.feature.txt'
    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)

# training a quantifier
learner = LogisticRegression()
model = qp.method.aggregative.ClassifyAndCount(learner)
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
model.fit(dataset.training)

# estimating class prevalences
prevalences_estim = model.quantify(dataset.test.instances)
prevalences_true = dataset.test.prevalence()

# evaluation (one single prediction)
error = qp.error.mae(prevalences_true, prevalences_estim)

print(f'method {model.__class__.__name__}')

print(f'Evaluation in test (1 eval)')
print(f'true prevalence {F.strprev(prevalences_true)}')
print(f'estim prevalence {F.strprev(prevalences_estim)}')
print(f'mae={error:.3f}')

true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE)

qp.error.SAMPLE_SIZE = SAMPLE_SIZE
print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
for error in qp.error.QUANTIFICATION_ERROR:
    score = error(true_prev, estim_prev)
    print(f'{error.__name__}={score:.5f}')