Merge pull request #1 from HLT-ISTI/tests_and_classnames
Tests and class names
commit c280c03fdb

TODO.txt
@@ -17,14 +17,13 @@ Current issues:
 In binary quantification (hp, kindle, imdb) we used F1 in the minority class (which in kindle and hp happens to be the
 negative class). This is not covered in this new implementation, in which the binary case is not treated as such, but as
 an instance of single-label with 2 labels. Check
-Add classnames to LabelledCollection? This should improve visualization of reports
 Add automatic reindex of class labels in LabelledCollection (currently, class indexes should be ordered and with no gaps)
 OVR I believe is currently tied to aggregative methods. We should provide a general interface also for general quantifiers
 Currently, being "binary" only adds one checker; we should figure out how to impose the check to be automatically performed
+Add random seed management to support replicability (see temp_seed in util.py).
 
 Improvements:
 ==========================================
-Clarify whether QuaNet is an aggregative method or not.
 Explore the hyperparameter "number of bins" in HDy
 Rename EMQ to SLD ?
 Parallelize the kFCV in ACC and PACC?
 
@@ -11,8 +11,8 @@ from torch.nn.utils.rnn import pad_sequence
 from tqdm import tqdm
 
 import quapy as qp
-from data import LabelledCollection
-from util import EarlyStop
+from quapy.data import LabelledCollection
+from quapy.util import EarlyStop
 
 
 class NeuralClassifierTrainer:
@@ -2,12 +2,21 @@ import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
 from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
 
 from quapy.functional import artificial_prevalence_sampling, strprev
 
 
 class LabelledCollection:
+    '''
+    A LabelledCollection is a set of objects each with a label associated to it.
+    '''
 
-    def __init__(self, instances, labels, n_classes=None):
+    def __init__(self, instances, labels, classes_=None):
+        """
+        :param instances: list of objects
+        :param labels: list of labels, same length of instances
+        :param classes_: optional, list of classes from which labels are taken. When used, must contain the set of values used in labels.
+        """
         if issparse(instances):
             self.instances = instances
         elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):

@@ -15,14 +24,17 @@ class LabelledCollection:
             self.instances = np.asarray(instances, dtype=object)
         else:
             self.instances = np.asarray(instances)
-        self.labels = np.asarray(labels, dtype=int)
+        self.labels = np.asarray(labels)
         n_docs = len(self)
-        if n_classes is None:
+        if classes_ is None:
             self.classes_ = np.unique(self.labels)
             self.classes_.sort()
         else:
-            self.classes_ = np.arange(n_classes)
-        self.index = {class_i: np.arange(n_docs)[self.labels == class_i] for class_i in self.classes_}
+            self.classes_ = np.unique(np.asarray(classes_))
+            self.classes_.sort()
+            if len(set(self.labels).difference(set(classes_))) > 0:
+                raise ValueError('labels contains values not included in classes_')
+        self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
 
     @classmethod
     def load(cls, path: str, loader_func: callable):
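Note (illustrative, not part of the commit): with the new classes_ argument a collection can declare classes that happen to be absent from a particular sample, and labels no longer need to be integers in 0..n-1. A minimal sketch:

from quapy.data import LabelledCollection

# labels may now be arbitrary values (e.g. strings), not only 0..n_classes-1
texts = ['good movie', 'awful plot', 'great cast', 'boring']
labels = ['pos', 'neg', 'pos', 'neg']

# declare the full class space explicitly, even if this sample misses some class
data = LabelledCollection(texts, labels, classes_=['neg', 'pos', 'neutral'])
print(data.classes_)      # ['neg' 'neutral' 'pos'] (sorted by np.unique)
print(data.prevalence())  # one entry per class in data.classes_

# a label value not listed in classes_ raises
# ValueError('labels contains values not included in classes_')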
@@ -35,7 +47,7 @@ class LabelledCollection:
         return self.counts() / len(self)
 
     def counts(self):
-        return np.asarray([len(self.index[ci]) for ci in self.classes_])
+        return np.asarray([len(self.index[class_]) for class_ in self.classes_])
 
     @property
     def n_classes(self):

@@ -55,14 +67,14 @@ class LabelledCollection:
 
         taken = 0
         indexes_sample = []
-        for i, class_i in enumerate(self.classes_):
+        for i, class_ in enumerate(self.classes_):
             if i == self.n_classes - 1:
                 n_requested = size - taken
             else:
                 n_requested = int(size * prevs[i])
 
-            n_candidates = len(self.index[class_i])
-            index_sample = self.index[class_i][
+            n_candidates = len(self.index[class_])
+            index_sample = self.index[class_][
                 np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates))
             ] if n_requested > 0 else []
 

@@ -90,12 +102,13 @@ class LabelledCollection:
     def sampling_from_index(self, index):
         documents = self.instances[index]
         labels = self.labels[index]
-        return LabelledCollection(documents, labels, n_classes=self.n_classes)
+        return LabelledCollection(documents, labels, classes_=self.classes_)
 
     def split_stratified(self, train_prop=0.6, random_state=None):
         # with temp_seed(42):
         tr_docs, te_docs, tr_labels, te_labels = \
-            train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state)
+            train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
+                             random_state=random_state)
         return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
 
     def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):

@@ -144,7 +157,7 @@ class LabelledCollection:
         stats_ = {'instances': ninstances,
                   'type': instance_type,
                   'features': nfeats,
-                  'classes': self.n_classes,
+                  'classes': self.classes_,
                   'prevs': strprev(self.prevalence())}
         if show:
             print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '

@@ -158,10 +171,11 @@ class LabelledCollection:
         test = self.sampling_from_index(test_index)
         yield train, test
 
+
 class Dataset:
 
     def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
-        assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections'
+        assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections'
         self.training = training
         self.test = test
         self.vocabulary = vocabulary
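Note (sketch, assuming that sampling() builds its output through sampling_from_index): since samples now inherit classes_ from their parent collection, a sample drawn at an extreme prevalence still reports the full class space even if one class receives zero instances.

# data is a binary LabelledCollection with enough instances per class
sample = data.sampling(100, 0.9, 0.1)   # 100 instances at prevalence (0.9, 0.1)
print(sample.prevalence())              # approximately [0.9, 0.1]
print(sample.classes_)                  # same classes as data.classes_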
@@ -171,6 +185,10 @@ class Dataset:
     def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
         return Dataset(*collection.split_stratified(train_prop=train_size))
 
+    @property
+    def classes_(self):
+        return self.training.classes_
+
     @property
     def n_classes(self):
         return self.training.n_classes

@@ -207,7 +225,3 @@ def isbinary(data):
     if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
         return data.binary
     return False
-
-
-
-
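Note (illustrative only): Dataset now exposes the class space of its two collections directly, which the reports and tests below rely on.

from quapy.data import Dataset, LabelledCollection

collection = LabelledCollection(['good', 'bad', 'fine', 'awful', 'nice', 'poor'],
                                ['pos', 'neg', 'pos', 'neg', 'pos', 'neg'])
dataset = Dataset.SplitStratified(collection, train_size=0.6)
print(dataset.classes_)   # ['neg' 'pos'], taken from the training collection
print(dataset.n_classes)  # 2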
@@ -47,7 +47,7 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                 'yeast']
 
 
-def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
+def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
     """
     Load a Reviews dataset as a Dataset instance, as used in:
     Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."

@@ -91,7 +91,7 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
     return data
 
 
-def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False):
+def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
     """
     Load a Twitter dataset as a Dataset instance, as used in:
     Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.

@@ -162,12 +162,12 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
     return data
 
 
-def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False):
+def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
     data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
     return Dataset(*data.split_stratified(1 - test_split, random_state=0))
 
 
-def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False):
+def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
 
     assert dataset_name in UCI_DATASETS, \
         f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
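Note (usage sketch; the datasets are downloaded on first use): the fetch_* helpers now declare Dataset as their return type, matching how the new tests consume them.

import quapy as qp

reviews = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
reviews.training.stats()
reviews.test.stats()

ionosphere = qp.datasets.fetch_UCIDataset('ionosphere', test_split=0.3)
print(ionosphere.classes_)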
@@ -29,13 +29,13 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
     test_documents = vectorizer.transform(dataset.test.instances)
 
     if inplace:
-        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.n_classes)
-        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.n_classes)
+        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.classes_)
+        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_)
         dataset.vocabulary = vectorizer.vocabulary_
         return dataset
     else:
-        training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.n_classes)
-        test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.n_classes)
+        training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_)
         return Dataset(training, test, vectorizer.vocabulary_)
 
 

@@ -66,8 +66,8 @@ def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
         dataset.test.instances = Xte
         return dataset
     else:
-        training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.n_classes)
-        test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.n_classes)
+        training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.classes_)
         return Dataset(training, test)
 
 

@@ -100,13 +100,13 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
     test_index = indexer.transform(dataset.test.instances)
 
     if inplace:
-        dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.n_classes)
-        dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.n_classes)
+        dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.classes_)
+        dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.classes_)
         dataset.vocabulary = indexer.vocabulary_
         return dataset
     else:
-        training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.n_classes)
-        test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.n_classes)
+        training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.classes_)
         return Dataset(training, test, indexer.vocabulary_)
 
 
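Note (sketch, not part of the commit): passing dataset.classes_ instead of dataset.n_classes keeps the original label values attached to the transformed collections.

import quapy as qp

dataset = qp.datasets.fetch_reviews('hp')                    # raw text
qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)
print(dataset.training.instances.shape)                      # sparse tf-idf matrix
print(dataset.classes_)                                      # classes are unchanged by the transformation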
@@ -3,7 +3,7 @@ from scipy.sparse import dok_matrix
 from tqdm import tqdm
 
 
-def from_text(path):
+def from_text(path, encoding='utf-8'):
     """
     Reas a labelled colletion of documents.
     File fomart <0 or 1>\t<document>\n

@@ -11,7 +11,7 @@ def from_text(path):
     :return: a list of sentences, and a list of labels
     """
     all_sentences, all_labels = [], []
-    for line in tqdm(open(path, 'rt').readlines(), f'loading {path}'):
+    for line in tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}'):
         line = line.strip()
         if line:
             label, sentence = line.split('\t')

@@ -25,8 +25,8 @@ def from_text(path):
 
 def from_sparse(path):
     """
-    Reas a labelled colletion of real-valued instances expressed in sparse format
-    File fomart <-1 or 0 or 1>[\s col(int):val(float)]\n
+    Reads a labelled collection of real-valued instances expressed in sparse format
+    File format <-1 or 0 or 1>[\s col(int):val(float)]\n
     :param path: path to the labelled collection
     :return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
     """

@@ -56,16 +56,16 @@ def from_sparse(path):
     return X, y
 
 
-def from_csv(path):
+def from_csv(path, encoding='utf-8'):
     """
-    Reas a csv file in which columns are separated by ','.
-    File fomart <label>,<feat1>,<feat2>,...,<featn>\n
+    Reads a csv file in which columns are separated by ','.
+    File format <label>,<feat1>,<feat2>,...,<featn>\n
     :param path: path to the csv file
     :return: a ndarray for the labels and a ndarray (float) for the covariates
     """
 
     X, y = [], []
-    for instance in tqdm(open(path, 'rt').readlines(), desc=f'reading {path}'):
+    for instance in tqdm(open(path, 'rt', encoding=encoding).readlines(), desc=f'reading {path}'):
         yi, *xi = instance.strip().split(',')
         X.append(list(map(float,xi)))
         y.append(yi)
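Note (sketch; the import path of the reader functions is assumed, and the file name is hypothetical): the new encoding parameter mainly matters on platforms whose default locale encoding is not UTF-8.

from quapy.data import LabelledCollection
from quapy.data.reader import from_text   # import path assumed

# hypothetical file in '<0 or 1>\t<document>' format, one document per line
instances, labels = from_text('reviews_train.txt', encoding='utf-8')
data = LabelledCollection(instances, labels)
data.stats()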
@@ -36,12 +36,12 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
     return p
 
 
-def prevalence_from_labels(labels, n_classes):
+def prevalence_from_labels(labels, classes_):
     if labels.ndim != 1:
         raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
     unique, counts = np.unique(labels, return_counts=True)
     by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
-    prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float)
+    prevalences = np.asarray([by_class[class_] for class_ in classes_], dtype=np.float)
     prevalences /= prevalences.sum()
     return prevalences
 

@@ -51,7 +51,7 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False):
         raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities')
     if binarize:
         predictions = np.argmax(posteriors, axis=-1)
-        return prevalence_from_labels(predictions, n_classes=posteriors.shape[1])
+        return prevalence_from_labels(predictions, np.arange(posteriors.shape[1]))
     else:
         prevalences = posteriors.mean(axis=0)
         prevalences /= prevalences.sum()
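Note (illustrative): passing the actual class values, rather than a number of classes, keeps prevalence vectors aligned with classes_ even for string labels or for classes that never occur among the predictions.

import numpy as np
import quapy.functional as F

predictions = np.asarray(['neg', 'neg', 'pos', 'neg'])
prevs = F.prevalence_from_labels(predictions, ['neg', 'neutral', 'pos'])
print(prevs)  # [0.75 0.   0.25], one entry per class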
@@ -3,21 +3,31 @@ from . import base
 from . import meta
 from . import non_aggregative
 
+EXPLICIT_LOSS_MINIMIZATION_METHODS = {
+    aggregative.ELM,
+    aggregative.SVMQ,
+    aggregative.SVMAE,
+    aggregative.SVMKLD,
+    aggregative.SVMRAE,
+    aggregative.SVMNKLD
+}
+
+
 AGGREGATIVE_METHODS = {
     aggregative.CC,
     aggregative.ACC,
     aggregative.PCC,
     aggregative.PACC,
-    aggregative.ELM,
     aggregative.EMQ,
     aggregative.HDy
-}
+} | EXPLICIT_LOSS_MINIMIZATION_METHODS
 
 
 NON_AGGREGATIVE_METHODS = {
     non_aggregative.MaximumLikelihoodPrevalenceEstimation
 }
 
 META_METHODS = {
+    meta.Ensemble,
     meta.QuaNet
 }
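Note (sketch mirroring the new tests): the registries support set operations, which makes it easy to skip the SVMperf-based methods when the external binary is not installed.

from sklearn.linear_model import LogisticRegression
from quapy.method import AGGREGATIVE_METHODS, EXPLICIT_LOSS_MINIMIZATION_METHODS

# every aggregative method that does not require the external SVMperf binary
for method in AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS):
    model = method(LogisticRegression())
    print(type(model).__name__)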
@@ -1,6 +1,7 @@
 from abc import abstractmethod
 from copy import deepcopy
 from typing import Union
+
 import numpy as np
 from joblib import Parallel, delayed
 from sklearn.base import BaseEstimator

@@ -8,6 +9,7 @@ from sklearn.calibration import CalibratedClassifierCV
 from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import StratifiedKFold
 from tqdm import tqdm
+
 import quapy as qp
 import quapy.functional as F
 from quapy.classification.svmperf import SVMperf

@@ -53,10 +55,10 @@ class AggregativeQuantifier(BaseQuantifier):
 
     @property
     def n_classes(self):
-        return len(self.classes)
+        return len(self.classes_)
 
     @property
-    def classes(self):
+    def classes_(self):
         return self.learner.classes_
 
     @property

@@ -127,7 +129,8 @@ def training_helper(learner,
             train = data
             unused = val_split
         else:
-            raise ValueError(f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split '
+            raise ValueError(
+                f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split '
                 'proportion, or a LabelledCollection indicating the validation split')
     else:
         train, unused = data, None

@@ -167,7 +170,7 @@ class CC(AggregativeQuantifier):
         return self
 
     def aggregate(self, classif_predictions):
-        return F.prevalence_from_labels(classif_predictions, self.n_classes)
+        return F.prevalence_from_labels(classif_predictions, self.classes_)
 
 
 class ACC(AggregativeQuantifier):

@@ -294,7 +297,8 @@ class PACC(AggregativeProbabilisticQuantifier):
             y_ = np.vstack(y_)
 
             # fit the learner on all data
-            self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, val_split=None)
+            self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True,
+                                              val_split=None)
 
         else:
             self.learner, val_data = training_helper(

@@ -307,8 +311,8 @@ class PACC(AggregativeProbabilisticQuantifier):
         # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
         # document that belongs to yj ends up being classified as belonging to yi
         confusion = np.empty(shape=(data.n_classes, data.n_classes))
-        for yi in range(data.n_classes):
-            confusion[yi] = y_[y==yi].mean(axis=0)
+        for i,class_ in enumerate(data.classes_):
+            confusion[i] = y_[y == class_].mean(axis=0)
 
         self.Pte_cond_estim_ = confusion.T
 

@@ -338,7 +342,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
 
     def fit(self, data: LabelledCollection, fit_learner=True):
         self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
-        self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes)
+        self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
         return self
 
     def aggregate(self, classif_posteriors, epsilon=EPSILON):

@@ -406,12 +410,14 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         self.learner, validation = training_helper(
             self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
         Px = self.posterior_probabilities(validation.instances)[:, 1]  # takes only the P(y=+1|x)
-        self.Pxy1 = Px[validation.labels == 1]
-        self.Pxy0 = Px[validation.labels == 0]
+        self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
+        self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
         # pre-compute the histogram for positive and negative examples
         self.bins = np.linspace(10, 110, 11, dtype=int)  # [10, 20, 30, ..., 100, 110]
-        self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins}
-        self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins}
+        self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in
+                             self.bins}
+        self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in
+                             self.bins}
         return self
 
     def aggregate(self, classif_posteriors):

@@ -439,8 +445,8 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
                     prev_selected, min_dist = prev, hdy
             prev_estimations.append(prev_selected)
 
-        pos_class_prev = np.median(prev_estimations)
-        return np.asarray([1-pos_class_prev, pos_class_prev])
+        class1_prev = np.median(prev_estimations)
+        return np.asarray([1 - class1_prev, class1_prev])
 
 
 class ELM(AggregativeQuantifier, BinaryQuantifier):

@@ -458,7 +464,7 @@ class ELM(AggregativeQuantifier, BinaryQuantifier):
         return self
 
     def aggregate(self, classif_predictions: np.ndarray):
-        return F.prevalence_from_labels(classif_predictions, self.learner.n_classes_)
+        return F.prevalence_from_labels(classif_predictions, self.classes_)
 
     def classify(self, X, y=None):
         return self.learner.predict(X)

@@ -470,6 +476,7 @@ class SVMQ(ELM):
     Quantification-oriented learning based on reliable classifiers.
     Pattern Recognition, 48(2):591–604.
     """
+
     def __init__(self, svmperf_base=None, **kwargs):
         super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
 

@@ -480,6 +487,7 @@ class SVMKLD(ELM):
     Optimizing text quantifiers for multivariate loss functions.
     ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27.
     """
+
     def __init__(self, svmperf_base=None, **kwargs):
         super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)
 

@@ -490,6 +498,7 @@ class SVMNKLD(ELM):
     Optimizing text quantifiers for multivariate loss functions.
     ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27.
     """
+
     def __init__(self, svmperf_base=None, **kwargs):
         super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)
 

@@ -606,7 +615,7 @@ class OneVsAll(AggregativeQuantifier):
         return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
 
     def _delayed_binary_fit(self, c, data):
-        bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
+        bindata = LabelledCollection(data.instances, data.labels == c, classes_=[False, True])
         self.dict_binary_quantifiers[c].fit(bindata)
 
     @property

@@ -616,9 +625,3 @@ class OneVsAll(AggregativeQuantifier):
     @property
     def probabilistic(self):
         return self.binary_quantifier.probabilistic
-
-
-
-
-
-
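Note (end-to-end sketch, not part of the commit): for users, the visible change is that aggregative quantifiers expose class values through classes_ (delegated to the underlying sklearn learner) and return prevalence vectors aligned with it.

import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PACC

dataset = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5)

model = PACC(LogisticRegression())
model.fit(dataset.training)
print(model.classes_)                 # class values as seen by the learner

estim_prevalences = model.quantify(dataset.test.instances)
true_prevalences = dataset.test.prevalence()
print(qp.error.mae(true_prevalences, estim_prevalences))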
@@ -19,6 +19,10 @@ class BaseQuantifier(metaclass=ABCMeta):
     @abstractmethod
     def get_params(self, deep=True): ...
 
+    @property
+    @abstractmethod
+    def classes_(self): ...
+
     # these methods allows meta-learners to reimplement the decision based on their constituents, and not
     # based on class structure
     @property
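Note (hypothetical sketch, modelled on MaximumLikelihoodPrevalenceEstimation below): since classes_ is now part of the abstract interface, custom quantifiers must expose it as well.

from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier


class TrainingPrevalence(BaseQuantifier):
    """Toy quantifier that always returns the training prevalence."""

    def fit(self, data: LabelledCollection):
        self._classes_ = data.classes_
        self._prev = data.prevalence()
        return self

    def quantify(self, instances):
        return self._prev

    @property
    def classes_(self):
        return self._classes_

    def get_params(self, deep=True):
        return {}

    def set_params(self, **params):
        pass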
@@ -186,6 +186,10 @@ class Ensemble(BaseQuantifier):
         order = np.argsort(dist)
         return _select_k(predictions, order, k=self.red_size)
 
+    @property
+    def classes_(self):
+        return self.base_quantifier.classes_
+
     @property
     def binary(self):
         return self.base_quantifier.binary

@@ -58,6 +58,7 @@ class QuaNetTrainer(BaseQuantifier):
         self.device = torch.device(device)
 
         self.__check_params_colision(self.quanet_params, self.learner.get_params())
+        self._classes_ = None
 
     def fit(self, data: LabelledCollection, fit_learner=True):
         """

@@ -67,6 +68,7 @@ class QuaNetTrainer(BaseQuantifier):
         :param fit_learner: if true, trains the classifier on a split containing 40% of the data
         :return: self
         """
+        self._classes_ = data.classes_
         classifier_data, unused_data = data.split_stratified(0.4)
         train_data, valid_data = unused_data.split_stratified(0.66)  # 0.66 split of 60% makes 40% and 20%
 

@@ -256,6 +258,10 @@ class QuaNetTrainer(BaseQuantifier):
         import shutil
         shutil.rmtree(self.checkpointdir, ignore_errors=True)
 
+    @property
+    def classes_(self):
+        return self._classes_
+
 
 def mae_loss(output, target):
     return torch.mean(torch.abs(output - target))
@@ -2,18 +2,22 @@ from quapy.data import LabelledCollection
 from .base import BaseQuantifier
 
 
 
 class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
 
     def __init__(self, **kwargs):
-        pass
+        self._classes_ = None
 
     def fit(self, data: LabelledCollection, *args):
+        self._classes_ = data.classes_
         self.estimated_prevalence = data.prevalence()
 
     def quantify(self, documents, *args):
         return self.estimated_prevalence
 
+    @property
+    def classes_(self):
+        return self._classes_
+
     def get_params(self):
         pass
 
@@ -4,7 +4,6 @@ from copy import deepcopy
 from typing import Union, Callable
 
 import quapy as qp
-import quapy.functional as F
 from quapy.data.base import LabelledCollection
 from quapy.evaluation import artificial_sampling_prediction
 from quapy.method.aggregative import BaseQuantifier

@@ -118,6 +117,7 @@ class GridSearchQ(BaseQuantifier):
         def handler(signum, frame):
             self.sout('timeout reached')
             raise TimeoutError()
 
         signal.signal(signal.SIGALRM, handler)
+
         self.sout(f'starting optimization with n_jobs={n_jobs}')

@@ -175,6 +175,10 @@ class GridSearchQ(BaseQuantifier):
     def quantify(self, instances):
         return self.best_model_.quantify(instances)
 
+    @property
+    def classes_(self):
+        return self.best_model_.classes_
+
     def set_params(self, **parameters):
         self.param_grid = parameters
 

@@ -185,4 +189,3 @@ class GridSearchQ(BaseQuantifier):
         if hasattr(self, 'best_model_'):
             return self.best_model_
         raise ValueError('best_model called before fit')
-
@@ -6,13 +6,38 @@ from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DA
 
 @pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
 def test_fetch_reviews(dataset_name):
-    fetch_reviews(dataset_name)
+    dataset = fetch_reviews(dataset_name)
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.training.stats()
+    print('Test set stats')
+    dataset.test.stats()
 
 
 @pytest.mark.parametrize('dataset_name', TWITTER_SENTIMENT_DATASETS_TEST + TWITTER_SENTIMENT_DATASETS_TRAIN)
 def test_fetch_twitter(dataset_name):
-    fetch_twitter(dataset_name)
+    try:
+        dataset = fetch_twitter(dataset_name)
+    except ValueError as ve:
+        if dataset_name == 'semeval' and ve.args[0].startswith(
+                'dataset "semeval" can only be used for model selection.'):
+            dataset = fetch_twitter(dataset_name, for_model_selection=True)
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.training.stats()
+    print('Test set stats')
 
 
 @pytest.mark.parametrize('dataset_name', UCI_DATASETS)
 def test_fetch_UCIDataset(dataset_name):
-    fetch_UCIDataset(dataset_name)
+    try:
+        dataset = fetch_UCIDataset(dataset_name)
+    except FileNotFoundError as fnfe:
+        if dataset_name == 'pageblocks.5' and fnfe.args[0].find(
+                'If this is the first time you attempt to load this dataset') > 0:
+            print('The pageblocks.5 dataset requires some hand processing to be usable, skipping this test.')
+            return
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.training.stats()
+    print('Test set stats')
@@ -1,24 +1,30 @@
 import numpy
 import pytest
 from sklearn.linear_model import LogisticRegression
-from sklearn.naive_bayes import MultinomialNB
 from sklearn.svm import LinearSVC
 
 import quapy as qp
+from quapy.data import Dataset, LabelledCollection
+from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS, EXPLICIT_LOSS_MINIMIZATION_METHODS
+from quapy.method.aggregative import ACC, PACC, HDy
+from quapy.method.meta import Ensemble
 
-datasets = [qp.datasets.fetch_twitter('semeval16')]
+datasets = [pytest.param(qp.datasets.fetch_twitter('hcr'), id='hcr'),
+            pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]
 
-aggregative_methods = [qp.method.aggregative.CC, qp.method.aggregative.ACC, qp.method.aggregative.ELM]
-
-learners = [LogisticRegression, MultinomialNB, LinearSVC]
+learners = [LogisticRegression, LinearSVC]
 
 
 @pytest.mark.parametrize('dataset', datasets)
-@pytest.mark.parametrize('aggregative_method', aggregative_methods)
+@pytest.mark.parametrize('aggregative_method', AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS))
 @pytest.mark.parametrize('learner', learners)
-def test_aggregative_methods(dataset, aggregative_method, learner):
+def test_aggregative_methods(dataset: Dataset, aggregative_method, learner):
     model = aggregative_method(learner())
 
+    if model.binary and not dataset.binary:
+        print(f'skipping the test of binary model {type(model)} on non-binary dataset {dataset}')
+        return
+
     model.fit(dataset.training)
 
     estim_prevalences = model.quantify(dataset.test.instances)

@@ -27,3 +33,147 @@ def test_aggregative_methods(dataset, aggregative_method, learner):
     error = qp.error.mae(true_prevalences, estim_prevalences)
 
     assert type(error) == numpy.float64
+
+
+@pytest.mark.parametrize('dataset', datasets)
+@pytest.mark.parametrize('elm_method', EXPLICIT_LOSS_MINIMIZATION_METHODS)
+def test_elm_methods(dataset: Dataset, elm_method):
+    try:
+        model = elm_method()
+    except AssertionError as ae:
+        if ae.args[0].find('does not seem to point to a valid path') > 0:
+            print('Missing SVMperf binary program, skipping test')
+            return
+
+    if model.binary and not dataset.binary:
+        print(f'skipping the test of binary model {model} on non-binary dataset {dataset}')
+        return
+
+    model.fit(dataset.training)
+
+    estim_prevalences = model.quantify(dataset.test.instances)
+
+    true_prevalences = dataset.test.prevalence()
+    error = qp.error.mae(true_prevalences, estim_prevalences)
+
+    assert type(error) == numpy.float64
+
+
+@pytest.mark.parametrize('dataset', datasets)
+@pytest.mark.parametrize('non_aggregative_method', NON_AGGREGATIVE_METHODS)
+def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
+    model = non_aggregative_method()
+
+    if model.binary and not dataset.binary:
+        print(f'skipping the test of binary model {model} on non-binary dataset {dataset}')
+        return
+
+    model.fit(dataset.training)
+
+    estim_prevalences = model.quantify(dataset.test.instances)
+
+    true_prevalences = dataset.test.prevalence()
+    error = qp.error.mae(true_prevalences, estim_prevalences)
+
+    assert type(error) == numpy.float64
+
+
+@pytest.mark.parametrize('base_method', AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS))
+@pytest.mark.parametrize('learner', learners)
+@pytest.mark.parametrize('dataset', datasets)
+@pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES)
+def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
+    qp.environ['SAMPLE_SIZE'] = len(dataset.training)
+    model = Ensemble(quantifier=base_method(learner()), size=5, policy=policy, n_jobs=-1)
+    if model.binary and not dataset.binary:
+        print(f'skipping the test of binary model {model} on non-binary dataset {dataset}')
+        return
+
+    model.fit(dataset.training)
+
+    estim_prevalences = model.quantify(dataset.test.instances)
+
+    true_prevalences = dataset.test.prevalence()
+    error = qp.error.mae(true_prevalences, estim_prevalences)
+
+    assert type(error) == numpy.float64
+
+
+def test_quanet_method():
+    dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
+    dataset = Dataset(dataset.training.sampling(100, *dataset.training.prevalence()),
+                      dataset.test.sampling(100, *dataset.test.prevalence()))
+    qp.data.preprocessing.index(dataset, min_df=5, inplace=True)
+
+    from quapy.classification.neural import CNNnet
+    cnn = CNNnet(dataset.vocabulary_size, dataset.training.n_classes)
+
+    from quapy.classification.neural import NeuralClassifierTrainer
+    learner = NeuralClassifierTrainer(cnn, device='cuda')
+
+    from quapy.method.meta import QuaNet
+    model = QuaNet(learner, sample_size=len(dataset.training), device='cuda')
+
+    if model.binary and not dataset.binary:
+        print(f'skipping the test of binary model {model} on non-binary dataset {dataset}')
+        return
+
+    model.fit(dataset.training)
+
+    estim_prevalences = model.quantify(dataset.test.instances)
+
+    true_prevalences = dataset.test.prevalence()
+    error = qp.error.mae(true_prevalences, estim_prevalences)
+
+    assert type(error) == numpy.float64
+
+
+def models_to_test_for_str_label_names():
+    models = list()
+    learner = LogisticRegression
+    for method in AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS):
+        models.append(method(learner()))
+    for method in NON_AGGREGATIVE_METHODS:
+        models.append(method())
+    return models
+
+
+@pytest.mark.parametrize('model', models_to_test_for_str_label_names())
+def test_str_label_names(model):
+    if type(model) in {ACC, PACC, HDy}:
+        print(
+            f'skipping the test of binary model {type(model)} because it currently does not support random seed control.')
+        return
+
+    dataset = qp.datasets.fetch_reviews('imdb', pickle=True)
+    dataset = Dataset(dataset.training.sampling(1000, *dataset.training.prevalence()),
+                      dataset.test.sampling(1000, *dataset.test.prevalence()))
+    qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)
+
+    model.fit(dataset.training)
+
+    int_estim_prevalences = model.quantify(dataset.test.instances)
+    true_prevalences = dataset.test.prevalence()
+
+    error = qp.error.mae(true_prevalences, int_estim_prevalences)
+    assert type(error) == numpy.float64
+
+    dataset_str = Dataset(LabelledCollection(dataset.training.instances,
+                                             ['one' if label == 1 else 'zero' for label in dataset.training.labels]),
+                          LabelledCollection(dataset.test.instances,
+                                             ['one' if label == 1 else 'zero' for label in dataset.test.labels]))
+
+    model.fit(dataset_str.training)
+
+    str_estim_prevalences = model.quantify(dataset_str.test.instances)
+    true_prevalences = dataset_str.test.prevalence()
+
+    error = qp.error.mae(true_prevalences, str_estim_prevalences)
+    assert type(error) == numpy.float64
+
+    print(true_prevalences)
+    print(int_estim_prevalences)
+    print(str_estim_prevalences)
+
+    numpy.testing.assert_almost_equal(int_estim_prevalences[1],
+                                      str_estim_prevalences[list(model.classes_).index('one')])