
Bug fixes on the use of classes_. Tests.

Andrea Esuli 2021-05-05 17:12:44 +02:00
parent bfbfe08116
commit 5b772c7eda
8 changed files with 177 additions and 104 deletions

quapy/data/base.py

@@ -2,40 +2,52 @@ import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
 from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
 from quapy.functional import artificial_prevalence_sampling, strprev


 class LabelledCollection:
+    '''
+    A LabelledCollection is a set of objects each with a label associated to it.
+    '''

-    def __init__(self, instances, labels, n_classes=None):
+    def __init__(self, instances, labels, classes_=None):
+        """
+        :param instances: list of objects
+        :param labels: list of labels, same length of instances
+        :param classes_: optional, list of classes from which labels are taken. When used, must contain the set of values used in labels.
+        """
         if issparse(instances):
             self.instances = instances
-        elif isinstance(instances, list) and len(instances)>0 and isinstance(instances[0], str):
+        elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):
             # lists of strings occupy too much as ndarrays (although python-objects add a heavy overload)
             self.instances = np.asarray(instances, dtype=object)
         else:
             self.instances = np.asarray(instances)
-        self.labels = np.asarray(labels, dtype=int)
+        self.labels = np.asarray(labels)
         n_docs = len(self)
-        if n_classes is None:
+        if classes_ is None:
             self.classes_ = np.unique(self.labels)
             self.classes_.sort()
         else:
-            self.classes_ = np.arange(n_classes)
-        self.index = {class_i: np.arange(n_docs)[self.labels == class_i] for class_i in self.classes_}
+            self.classes_ = np.unique(np.asarray(classes_))
+            self.classes_.sort()
+            if len(set(self.labels).difference(set(classes_))) > 0:
+                raise ValueError('labels contains values not included in classes_')
+        self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}

     @classmethod
-    def load(cls, path:str, loader_func:callable):
+    def load(cls, path: str, loader_func: callable):
         return LabelledCollection(*loader_func(path))

     def __len__(self):
         return self.instances.shape[0]

     def prevalence(self):
-        return self.counts()/len(self)
+        return self.counts() / len(self)

     def counts(self):
-        return np.asarray([len(self.index[ci]) for ci in self.classes_])
+        return np.asarray([len(self.index[class_]) for class_ in self.classes_])

     @property
     def n_classes(self):

@@ -48,21 +60,21 @@ class LabelledCollection:
     def sampling_index(self, size, *prevs, shuffle=True):
         if len(prevs) == 0:  # no prevalence was indicated; returns an index for uniform sampling
             return np.random.choice(len(self), size, replace=False)
-        if len(prevs) == self.n_classes-1:
-            prevs = prevs + (1-sum(prevs),)
+        if len(prevs) == self.n_classes - 1:
+            prevs = prevs + (1 - sum(prevs),)
         assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
         assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'

         taken = 0
         indexes_sample = []
-        for i, class_i in enumerate(self.classes_):
-            if i == self.n_classes-1:
+        for i, class_ in enumerate(self.classes_):
+            if i == self.n_classes - 1:
                 n_requested = size - taken
             else:
                 n_requested = int(size * prevs[i])

-            n_candidates = len(self.index[class_i])
-            index_sample = self.index[class_i][
+            n_candidates = len(self.index[class_])
+            index_sample = self.index[class_][
                 np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates))
             ] if n_requested > 0 else []

@@ -90,21 +102,22 @@ class LabelledCollection:
     def sampling_from_index(self, index):
         documents = self.instances[index]
         labels = self.labels[index]
-        return LabelledCollection(documents, labels, n_classes=self.n_classes)
+        return LabelledCollection(documents, labels, classes_=self.classes_)

     def split_stratified(self, train_prop=0.6, random_state=None):
         # with temp_seed(42):
         tr_docs, te_docs, tr_labels, te_labels = \
-            train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state)
+            train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
+                             random_state=random_state)
         return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)

     def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
-        dimensions=self.n_classes
+        dimensions = self.n_classes
         for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
             yield self.sampling(sample_size, *prevs)

     def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
-        dimensions=self.n_classes
+        dimensions = self.n_classes
         for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
             yield self.sampling_index(sample_size, *prevs)

@@ -142,10 +155,10 @@ class LabelledCollection:
         else:
             nfeats = '?'
         stats_ = {'instances': ninstances,
                   'type': instance_type,
                   'features': nfeats,
-                  'classes': self.n_classes,
+                  'classes': self.classes_,
                   'prevs': strprev(self.prevalence())}
         if show:
             print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '
                   f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')

@@ -155,13 +168,14 @@ class LabelledCollection:
         kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
         for train_index, test_index in kf.split(*self.Xy):
             train = self.sampling_from_index(train_index)
             test = self.sampling_from_index(test_index)
             yield train, test


 class Dataset:

     def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
-        assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections'
+        assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections'
         self.training = training
         self.test = test
         self.vocabulary = vocabulary

@@ -172,8 +186,8 @@ class Dataset:
         return Dataset(*collection.split_stratified(train_prop=train_size))

     @property
-    def n_classes(self):
-        return self.training.n_classes
+    def classes_(self):
+        return self.training.classes_

     @property
     def binary(self):

@@ -195,19 +209,15 @@ class Dataset:
         print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
               f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
               f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
-        return {'train': tr_stats ,'test':te_stats}
+        return {'train': tr_stats, 'test': te_stats}

     @classmethod
     def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):
         for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)):
-            yield Dataset(train, test, name=f'fold {(i%nfolds)+1}/{nfolds} (round={(i//nfolds)+1})')
+            yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')


 def isbinary(data):
     if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
         return data.binary
     return False
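
A minimal usage sketch of the new constructor contract (toy data, assumed for illustration, not part of the commit): forwarding classes_ keeps a sample aligned to the full class set even when some class is absent from it, and labels outside classes_ now raise a ValueError.

import numpy as np
from quapy.data import LabelledCollection

X = np.random.rand(12, 2)                       # toy instances
y = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]        # three classes
data = LabelledCollection(X, y)

# a subset with no instances of class 2: without classes_, the sample would
# infer classes_ == [0, 1] and report a 2-dimensional prevalence vector;
# forwarding classes_ keeps it 3-dimensional, aligned with the parent
idx = [0, 1, 4, 5]
sample = LabelledCollection(X[idx], np.asarray(y)[idx], classes_=data.classes_)
print(sample.prevalence())                      # [0.5 0.5 0. ]

# labels not contained in classes_ are now rejected:
# LabelledCollection(X, y, classes_=[0, 1])     # raises ValueError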

quapy/data/datasets.py

@@ -47,7 +47,7 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                 'yeast']


-def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
+def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
     """
     Load a Reviews dataset as a Dataset instance, as used in:
     Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."

@@ -91,7 +91,7 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
     return data


-def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False):
+def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
     """
     Load a Twitter dataset as a Dataset instance, as used in:
     Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.

@@ -162,12 +162,12 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
     return data


-def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False):
+def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
     data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
     return Dataset(*data.split_stratified(1 - test_split, random_state=0))


-def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False):
+def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
     assert dataset_name in UCI_DATASETS, \
         f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
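
The new return-type annotations make explicit that the fetch functions hand back a Dataset. A usage sketch (downloads the data on first call; 'hp' is one of the REVIEWS_SENTIMENT_DATASETS):

import quapy as qp

dataset = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5)
print(dataset.classes_)        # exposed by the new Dataset.classes_ property
dataset.training.stats()       # per-split statistics, as printed by the new tests
dataset.test.stats()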

quapy/data/preprocessing.py

@@ -29,13 +29,13 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
     test_documents = vectorizer.transform(dataset.test.instances)

     if inplace:
-        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.n_classes)
-        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.n_classes)
+        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.classes_)
+        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_)
         dataset.vocabulary = vectorizer.vocabulary_
         return dataset
     else:
-        training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.n_classes)
-        test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.n_classes)
+        training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_)
         return Dataset(training, test, vectorizer.vocabulary_)

@@ -66,8 +66,8 @@ def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
         dataset.test.instances = Xte
         return dataset
     else:
-        training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.n_classes)
-        test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.n_classes)
+        training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.classes_)
         return Dataset(training, test)

@@ -100,13 +100,13 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
     test_index = indexer.transform(dataset.test.instances)

     if inplace:
-        dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.n_classes)
-        dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.n_classes)
+        dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.classes_)
+        dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.classes_)
         dataset.vocabulary = indexer.vocabulary_
         return dataset
     else:
-        training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.n_classes)
-        test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.n_classes)
+        training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.classes_)
         return Dataset(training, test, indexer.vocabulary_)
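
These helpers now forward dataset.classes_ rather than a class count when rebuilding the collections, so rebuilt collections keep the original label values instead of being paired with classes_ == np.arange(n_classes). A hypothetical illustration of the pattern (toy data, not part of the commit):

from quapy.data import LabelledCollection

docs = ['good', 'fine', 'bad', 'awful']
labels = ['pos', 'pos', 'neg', 'neg']
original = LabelledCollection(docs, labels)

# the old n_classes=2 argument yielded classes_ == [0, 1], which can never
# match the 'neg'/'pos' labels; forwarding classes_ preserves them
rebuilt = LabelledCollection(docs, labels, classes_=original.classes_)
print(rebuilt.classes_)        # ['neg' 'pos']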

quapy/functional.py

@@ -36,12 +36,12 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
     return p


-def prevalence_from_labels(labels, n_classes):
+def prevalence_from_labels(labels, classes_):
     if labels.ndim != 1:
         raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
     unique, counts = np.unique(labels, return_counts=True)
     by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
-    prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float)
+    prevalences = np.asarray([by_class[class_] for class_ in classes_], dtype=np.float)
     prevalences /= prevalences.sum()
     return prevalences

@@ -51,7 +51,7 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False):
         raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities')
     if binarize:
         predictions = np.argmax(posteriors, axis=-1)
-        return prevalence_from_labels(predictions, n_classes=posteriors.shape[1])
+        return prevalence_from_labels(predictions, np.arange(posteriors.shape[1]))
     else:
         prevalences = posteriors.mean(axis=0)
         prevalences /= prevalences.sum()
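
prevalence_from_labels now receives the classes themselves rather than a count, so predictions need not be integers in 0..n-1. A small sketch (toy predictions, assumed for illustration):

import numpy as np
from quapy.functional import prevalence_from_labels

predictions = np.asarray(['neg', 'neg', 'neg', 'pos'])
print(prevalence_from_labels(predictions, ['neg', 'pos']))   # [0.75 0.25]

# the binarizing caller in prevalence_from_probabilities keeps the old
# behaviour by passing np.arange(posteriors.shape[1]) explicitly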

quapy/method/aggregative.py

@@ -1,6 +1,7 @@
 from abc import abstractmethod
 from copy import deepcopy
 from typing import Union
+
 import numpy as np
 from joblib import Parallel, delayed
 from sklearn.base import BaseEstimator

@@ -8,6 +9,7 @@ from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import StratifiedKFold
 from tqdm import tqdm
+
 import quapy as qp
 import quapy.functional as F
 from quapy.classification.svmperf import SVMperf

@@ -43,7 +45,7 @@ class AggregativeQuantifier(BaseQuantifier):
         return self.aggregate(classif_predictions)

     @abstractmethod
-    def aggregate(self, classif_predictions:np.ndarray): ...
+    def aggregate(self, classif_predictions: np.ndarray): ...

     def get_params(self, deep=True):
         return self.learner.get_params()

@@ -84,7 +86,7 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
     def set_params(self, **parameters):
         if isinstance(self.learner, CalibratedClassifierCV):
-            parameters = {'base_estimator__'+k:v for k,v in parameters.items()}
+            parameters = {'base_estimator__' + k: v for k, v in parameters.items()}
         self.learner.set_params(**parameters)

     @property

@@ -98,7 +100,7 @@ def training_helper(learner,
                     data: LabelledCollection,
                     fit_learner: bool = True,
                     ensure_probabilistic=False,
-                    val_split:Union[LabelledCollection, float]=None):
+                    val_split: Union[LabelledCollection, float] = None):
     """
     Training procedure common to all Aggregative Quantifiers.
     :param learner: the learner to be fit

@@ -122,13 +124,14 @@ def training_helper(learner,
         if isinstance(val_split, float):
             if not (0 < val_split < 1):
                 raise ValueError(f'train/val split {val_split} out of range, must be in (0,1)')
-            train, unused = data.split_stratified(train_prop=1-val_split)
-        elif val_split.__class__.__name__ == LabelledCollection.__name__: #isinstance(val_split, LabelledCollection):
+            train, unused = data.split_stratified(train_prop=1 - val_split)
+        elif val_split.__class__.__name__ == LabelledCollection.__name__:  # isinstance(val_split, LabelledCollection):
             train = data
             unused = val_split
         else:
-            raise ValueError(f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split '
-                             'proportion, or a LabelledCollection indicating the validation split')
+            raise ValueError(
+                f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split '
+                'proportion, or a LabelledCollection indicating the validation split')
     else:
         train, unused = data, None

@@ -153,7 +156,7 @@ class CC(AggregativeQuantifier):
     attributed each of the classes in order to compute class prevalence estimates.
     """

-    def __init__(self, learner:BaseEstimator):
+    def __init__(self, learner: BaseEstimator):
         self.learner = learner

     def fit(self, data: LabelledCollection, fit_learner=True):

@@ -167,16 +170,16 @@ class CC(AggregativeQuantifier):
         return self

     def aggregate(self, classif_predictions):
-        return F.prevalence_from_labels(classif_predictions, self.n_classes)
+        return F.prevalence_from_labels(classif_predictions, self.classes_)


 class ACC(AggregativeQuantifier):

-    def __init__(self, learner:BaseEstimator, val_split=0.4):
+    def __init__(self, learner: BaseEstimator, val_split=0.4):
         self.learner = learner
         self.val_split = val_split

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection]=None):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
         """
         Trains a ACC quantifier
         :param data: the training set

@@ -262,7 +265,7 @@ class PACC(AggregativeProbabilisticQuantifier):
         self.learner = learner
         self.val_split = val_split

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=None):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
         """
         Trains a PACC quantifier
         :param data: the training set

@@ -294,7 +297,8 @@ class PACC(AggregativeProbabilisticQuantifier):
             y_ = np.vstack(y_)

             # fit the learner on all data
-            self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, val_split=None)
+            self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True,
+                                              val_split=None)

         else:
             self.learner, val_data = training_helper(

@@ -307,8 +311,8 @@ class PACC(AggregativeProbabilisticQuantifier):
         # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
         # document that belongs to yj ends up being classified as belonging to yi
         confusion = np.empty(shape=(data.n_classes, data.n_classes))
-        for yi in range(data.n_classes):
-            confusion[yi] = y_[y==yi].mean(axis=0)
+        for i, class_ in enumerate(data.classes_):
+            confusion[i] = y_[y == class_].mean(axis=0)

         self.Pte_cond_estim_ = confusion.T

@@ -338,7 +342,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
     def fit(self, data: LabelledCollection, fit_learner=True):
         self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
-        self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes)
+        self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
         return self

     def aggregate(self, classif_posteriors, epsilon=EPSILON):

@@ -366,7 +370,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
             # M-step:
             qs = ps.mean(axis=0)

-            if qs_prev_ is not None and qp.error.mae(qs, qs_prev_) < epsilon and s>10:
+            if qs_prev_ is not None and qp.error.mae(qs, qs_prev_) < epsilon and s > 10:
                 converged = True

             qs_prev_ = qs

@@ -389,7 +393,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         self.learner = learner
         self.val_split = val_split

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=None):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None):
         """
         Trains a HDy quantifier
         :param data: the training set

@@ -405,13 +409,15 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         self._check_binary(data, self.__class__.__name__)
         self.learner, validation = training_helper(
             self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
-        Px = self.posterior_probabilities(validation.instances)[:,1]  # takes only the P(y=+1|x)
-        self.Pxy1 = Px[validation.labels == 1]
-        self.Pxy0 = Px[validation.labels == 0]
+        Px = self.posterior_probabilities(validation.instances)[:, 1]  # takes only the P(y=+1|x)
+        self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
+        self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
         # pre-compute the histogram for positive and negative examples
-        self.bins = np.linspace(10, 110, 11, dtype=int)  #[10, 20, 30, ..., 100, 110]
-        self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins}
-        self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins}
+        self.bins = np.linspace(10, 110, 11, dtype=int)  # [10, 20, 30, ..., 100, 110]
+        self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in
+                             self.bins}
+        self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in
+                             self.bins}
         return self

     def aggregate(self, classif_posteriors):

@@ -419,12 +425,12 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         # and the final estimated a priori probability was taken as the median of these 11 estimates."
         # (González-Castro, et al., 2013).

-        Px = classif_posteriors[:,1]  # takes only the P(y=+1|x)
+        Px = classif_posteriors[:, 1]  # takes only the P(y=+1|x)

         prev_estimations = []
-        #for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
-            #Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
-            #Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
+        # for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
+        # Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
+        # Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
         for bins in self.bins:
             Pxy0_density = self.Pxy0_density[bins]
             Pxy1_density = self.Pxy1_density[bins]

@@ -433,14 +439,14 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
             prev_selected, min_dist = None, None
             for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0):
-                Px_train = prev*Pxy1_density + (1 - prev)*Pxy0_density
+                Px_train = prev * Pxy1_density + (1 - prev) * Pxy0_density
                 hdy = F.HellingerDistance(Px_train, Px_test)
                 if prev_selected is None or hdy < min_dist:
                     prev_selected, min_dist = prev, hdy
             prev_estimations.append(prev_selected)

-        pos_class_prev = np.median(prev_estimations)
-        return np.asarray([1-pos_class_prev, pos_class_prev])
+        class1_prev = np.median(prev_estimations)
+        return np.asarray([1 - class1_prev, class1_prev])


 class ELM(AggregativeQuantifier, BinaryQuantifier):

@@ -457,8 +463,8 @@ class ELM(AggregativeQuantifier, BinaryQuantifier):
         self.learner.fit(data.instances, data.labels)
         return self

-    def aggregate(self, classif_predictions:np.ndarray):
-        return F.prevalence_from_labels(classif_predictions, self.learner.n_classes_)
+    def aggregate(self, classif_predictions: np.ndarray):
+        return F.prevalence_from_labels(classif_predictions, self.classes_)

     def classify(self, X, y=None):
         return self.learner.predict(X)

@@ -470,6 +476,7 @@ class SVMQ(ELM):
     Quantification-oriented learning based on reliable classifiers.
     Pattern Recognition, 48(2):591–604.
     """
+
     def __init__(self, svmperf_base=None, **kwargs):
         super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)

@@ -480,6 +487,7 @@ class SVMKLD(ELM):
     Optimizing text quantifiers for multivariate loss functions.
     ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27.
     """
+
     def __init__(self, svmperf_base=None, **kwargs):
         super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)

@@ -490,6 +498,7 @@ class SVMNKLD(ELM):
     Optimizing text quantifiers for multivariate loss functions.
     ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27.
     """
+
     def __init__(self, svmperf_base=None, **kwargs):
         super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)

@@ -531,7 +540,7 @@ class OneVsAll(AggregativeQuantifier):
             f'{self.__class__.__name__} expect non-binary data'
         assert isinstance(self.binary_quantifier, BaseQuantifier), \
             f'{self.binary_quantifier} does not seem to be a Quantifier'
-        assert fit_learner==True, 'fit_learner must be True'
+        assert fit_learner == True, 'fit_learner must be True'

         self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
         self.__parallel(self._delayed_binary_fit, data)

@@ -559,11 +568,11 @@ class OneVsAll(AggregativeQuantifier):
     def aggregate(self, classif_predictions_bin):
         if self.probabilistic:
-            assert classif_predictions_bin.shape[1]==self.n_classes and classif_predictions_bin.shape[2]==2, \
+            assert classif_predictions_bin.shape[1] == self.n_classes and classif_predictions_bin.shape[2] == 2, \
                 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \
                 'probabilities (2 dimensions) for each document (row) and class (columns)'
         else:
-            assert set(np.unique(classif_predictions_bin)).issubset({0,1}), \
+            assert set(np.unique(classif_predictions_bin)).issubset({0, 1}), \
                 'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
                 'predictions for each document (row) and class (columns)'
         prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)

@@ -606,7 +615,7 @@ class OneVsAll(AggregativeQuantifier):
         return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]

     def _delayed_binary_fit(self, c, data):
-        bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
+        bindata = LabelledCollection(data.instances, data.labels == c, classes_=[False, True])
         self.dict_binary_quantifiers[c].fit(bindata)

     @property

@@ -616,9 +625,3 @@ class OneVsAll(AggregativeQuantifier):
     @property
     def probabilistic(self):
         return self.binary_quantifier.probabilistic
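
Several of the fixes above follow one principle: class identities come from the data or the learner, never from hard-coded 0/1. A sketch of the one-vs-all binarization now used in _delayed_binary_fit (toy labels, assumed for illustration):

import numpy as np
from quapy.data import LabelledCollection

labels = np.asarray(['pos', 'neg', 'neu', 'pos'])

# `labels == c` yields booleans; since the constructor no longer coerces
# labels with dtype=int (which used to turn the booleans into 0/1), the
# binary collection now declares its boolean classes explicitly
bindata = LabelledCollection(list('abcd'), labels == 'pos', classes_=[False, True])
print(bindata.prevalence())    # [0.5 0.5]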

quapy/method/base.py

@@ -19,8 +19,8 @@ class BaseQuantifier(metaclass=ABCMeta):
     @abstractmethod
     def get_params(self, deep=True): ...

-    @abstractmethod
     @property
+    @abstractmethod
     def classes_(self): ...

     # these methods allows meta-learners to reimplement the decision based on their constituents, and not
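
The swap fixes the decorator order: per the abc documentation, abstractmethod must be the innermost decorator when combined with other descriptors, and property then propagates the abstract flag from its getter. A minimal sketch:

from abc import ABCMeta, abstractmethod

class Base(metaclass=ABCMeta):
    # correct order: property wraps the abstract getter, so the
    # __isabstractmethod__ flag is propagated to the class
    @property
    @abstractmethod
    def classes_(self): ...

class Incomplete(Base):
    pass

# Incomplete() raises TypeError (abstract classes_ not implemented)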

quapy/tests/test_datasets.py

@@ -7,7 +7,11 @@ from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DA
 @pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
 def test_fetch_reviews(dataset_name):
     dataset = fetch_reviews(dataset_name)
-    print(dataset.n_classes, len(dataset.training), len(dataset.test))
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.training.stats()
+    print('Test set stats')
+    dataset.test.stats()


 @pytest.mark.parametrize('dataset_name', TWITTER_SENTIMENT_DATASETS_TEST + TWITTER_SENTIMENT_DATASETS_TRAIN)

@@ -18,7 +22,10 @@ def test_fetch_twitter(dataset_name):
         if dataset_name == 'semeval' and ve.args[0].startswith(
                 'dataset "semeval" can only be used for model selection.'):
             dataset = fetch_twitter(dataset_name, for_model_selection=True)
-    print(dataset.n_classes, len(dataset.training), len(dataset.test))
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.training.stats()
+    print('Test set stats')


 @pytest.mark.parametrize('dataset_name', UCI_DATASETS)

@@ -28,5 +35,9 @@ def test_fetch_UCIDataset(dataset_name):
     except FileNotFoundError as fnfe:
         if dataset_name == 'pageblocks.5' and fnfe.args[0].find(
                 'If this is the first time you attempt to load this dataset') > 0:
+            print('The pageblocks.5 dataset requires some hand processing to be usable, skipping this test.')
             return
-    print(dataset.n_classes, len(dataset.training), len(dataset.test))
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.training.stats()
+    print('Test set stats')

quapy/tests/test_methods.py

@@ -1,23 +1,23 @@
 import numpy
 import pytest
 from sklearn.linear_model import LogisticRegression
-from sklearn.naive_bayes import MultinomialNB
 from sklearn.svm import LinearSVC

 import quapy as qp
+from quapy.data import Dataset, LabelledCollection
 from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS, EXPLICIT_LOSS_MINIMIZATION_METHODS
 from quapy.method.meta import Ensemble

 datasets = [pytest.param(qp.datasets.fetch_twitter('hcr'), id='hcr'),
             pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]

-learners = [LogisticRegression, MultinomialNB, LinearSVC]
+learners = [LogisticRegression, LinearSVC]


 @pytest.mark.parametrize('dataset', datasets)
 @pytest.mark.parametrize('aggregative_method', AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS))
 @pytest.mark.parametrize('learner', learners)
-def test_aggregative_methods(dataset, aggregative_method, learner):
+def test_aggregative_methods(dataset: Dataset, aggregative_method, learner):
     model = aggregative_method(learner())

     if model.binary and not dataset.binary:

@@ -36,7 +36,7 @@ def test_aggregative_methods(dataset, aggregative_method, learner):
 @pytest.mark.parametrize('dataset', datasets)
 @pytest.mark.parametrize('elm_method', EXPLICIT_LOSS_MINIMIZATION_METHODS)
-def test_elm_methods(dataset, elm_method):
+def test_elm_methods(dataset: Dataset, elm_method):
     try:
         model = elm_method()
     except AssertionError as ae:

@@ -60,7 +60,7 @@ def test_elm_methods(dataset, elm_method):
 @pytest.mark.parametrize('dataset', datasets)
 @pytest.mark.parametrize('non_aggregative_method', NON_AGGREGATIVE_METHODS)
-def test_non_aggregative_methods(dataset, non_aggregative_method):
+def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
     model = non_aggregative_method()

     if model.binary and not dataset.binary:

@@ -81,7 +81,7 @@ def test_non_aggregative_methods(dataset, non_aggregative_method):
 @pytest.mark.parametrize('learner', learners)
 @pytest.mark.parametrize('dataset', datasets)
 @pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES)
-def test_ensemble_method(base_method, learner, dataset, policy):
+def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
     qp.environ['SAMPLE_SIZE'] = len(dataset.training)
     model = Ensemble(quantifier=base_method(learner()), size=5, policy=policy, n_jobs=-1)
     if model.binary and not dataset.binary:

@@ -100,10 +100,12 @@ def test_ensemble_method(base_method, learner, dataset, policy):
 def test_quanet_method():
     dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
+    dataset = Dataset(dataset.training.sampling(100, *dataset.training.prevalence()),
+                      dataset.test.sampling(100, *dataset.test.prevalence()))
     qp.data.preprocessing.index(dataset, min_df=5, inplace=True)

     from quapy.classification.neural import CNNnet
-    cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
+    cnn = CNNnet(dataset.vocabulary_size, dataset.training.n_classes)

     from quapy.classification.neural import NeuralClassifierTrainer
     learner = NeuralClassifierTrainer(cnn, device='cuda')

@@ -123,3 +125,50 @@ def test_quanet_method():
     error = qp.error.mae(true_prevalences, estim_prevalences)

     assert type(error) == numpy.float64
+
+
+def models_to_test_for_str_label_names():
+    models = list()
+    learner = LogisticRegression
+    for method in AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS):
+        models.append(method(learner()))
+    for method in NON_AGGREGATIVE_METHODS:
+        models.append(method())
+    return models
+
+
+@pytest.mark.parametrize('model', models_to_test_for_str_label_names())
+def test_str_label_names(model):
+    dataset = qp.datasets.fetch_reviews('imdb', pickle=True)
+    dataset = Dataset(dataset.training.sampling(1000, *dataset.training.prevalence()),
+                      dataset.test.sampling(1000, *dataset.test.prevalence()))
+    qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)
+
+    model.fit(dataset.training)
+
+    int_estim_prevalences = model.quantify(dataset.test.instances)
+    true_prevalences = dataset.test.prevalence()
+
+    error = qp.error.mae(true_prevalences, int_estim_prevalences)
+    assert type(error) == numpy.float64
+
+    dataset_str = Dataset(LabelledCollection(dataset.training.instances,
+                                             ['one' if label == 1 else 'zero' for label in dataset.training.labels]),
+                          LabelledCollection(dataset.test.instances,
+                                             ['one' if label == 1 else 'zero' for label in dataset.test.labels]))
+
+    model.fit(dataset_str.training)
+
+    str_estim_prevalences = model.quantify(dataset_str.test.instances)
+    true_prevalences = dataset_str.test.prevalence()
+
+    error = qp.error.mae(true_prevalences, str_estim_prevalences)
+    assert type(error) == numpy.float64
+
+    print(true_prevalences)
+    print(int_estim_prevalences)
+    print(str_estim_prevalences)
+
+    numpy.testing.assert_almost_equal(int_estim_prevalences[1],
+                                      str_estim_prevalences[list(model.classes_).index('one')])