forked from moreo/QuaPy
Merge branch 'master' of https://github.com/HLT-ISTI/QuaPy
This commit is contained in:
commit
9fd9d096f6
|
@ -11,6 +11,12 @@ used for evaluating quantification methods.
|
|||
QuaPy also integrates commonly used datasets and offers visualization tools
|
||||
for facilitating the analysis and interpretation of results.
|
||||
|
||||
### Installation
|
||||
|
||||
```commandline
|
||||
pip install quapy
|
||||
```
|
||||
|
||||
## A quick example:
|
||||
|
||||
The following script fetches a Twitter dataset, trains and evaluates an
|
||||
|
|
4
TODO.txt
4
TODO.txt
|
@ -2,7 +2,6 @@ Packaging:
|
|||
==========================================
|
||||
Documentation with sphinx
|
||||
Document methods with paper references
|
||||
allow for "pip install"
|
||||
unit-tests
|
||||
|
||||
New features:
|
||||
|
@ -18,14 +17,13 @@ SVMperf-based learners do not remove temp files in __del__?
|
|||
In binary quantification (hp, kindle, imdb) we used F1 in the minority class (which in kindle and hp happens to be the
|
||||
negative class). This is not covered in this new implementation, in which the binary case is not treated as such, but as
|
||||
an instance of single-label with 2 labels. Check
|
||||
Add classnames to LabelledCollection? This should improve visualization of reports
|
||||
Add automatic reindex of class labels in LabelledCollection (currently, class indexes should be ordered and with no gaps)
|
||||
OVR I believe is currently tied to aggregative methods. We should provide a general interface also for general quantifiers
|
||||
Currently, being "binary" only adds one checker; we should figure out how to impose the check to be automatically performed
|
||||
Add random seed management to support replicability (see temp_seed in util.py).
|
||||
|
||||
Improvements:
|
||||
==========================================
|
||||
Clarify whether QuaNet is an aggregative method or not.
|
||||
Explore the hyperparameter "number of bins" in HDy
|
||||
Rename EMQ to SLD ?
|
||||
Parallelize the kFCV in ACC and PACC?
|
||||
|
|
|
@ -10,7 +10,7 @@ from . import model_selection
|
|||
from . import classification
|
||||
from quapy.method.base import isprobabilistic, isaggregative
|
||||
|
||||
__version__ = '0.1'
|
||||
__version__ = '0.1.5'
|
||||
|
||||
environ = {
|
||||
'SAMPLE_SIZE': None,
|
||||
|
|
|
@ -11,8 +11,8 @@ from torch.nn.utils.rnn import pad_sequence
|
|||
from tqdm import tqdm
|
||||
|
||||
import quapy as qp
|
||||
from data import LabelledCollection
|
||||
from util import EarlyStop
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.util import EarlyStop
|
||||
|
||||
|
||||
class NeuralClassifierTrainer:
|
||||
|
|
|
@ -2,40 +2,52 @@ import numpy as np
|
|||
from scipy.sparse import issparse
|
||||
from scipy.sparse import vstack
|
||||
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
|
||||
|
||||
from quapy.functional import artificial_prevalence_sampling, strprev
|
||||
|
||||
|
||||
class LabelledCollection:
|
||||
'''
|
||||
A LabelledCollection is a set of objects each with a label associated to it.
|
||||
'''
|
||||
|
||||
def __init__(self, instances, labels, n_classes=None):
|
||||
def __init__(self, instances, labels, classes_=None):
|
||||
"""
|
||||
:param instances: list of objects
|
||||
:param labels: list of labels, same length of instances
|
||||
:param classes_: optional, list of classes from which labels are taken. When used, must contain the set of values used in labels.
|
||||
"""
|
||||
if issparse(instances):
|
||||
self.instances = instances
|
||||
elif isinstance(instances, list) and len(instances)>0 and isinstance(instances[0], str):
|
||||
elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):
|
||||
# lists of strings occupy too much as ndarrays (although python-objects add a heavy overload)
|
||||
self.instances = np.asarray(instances, dtype=object)
|
||||
else:
|
||||
self.instances = np.asarray(instances)
|
||||
self.labels = np.asarray(labels, dtype=int)
|
||||
self.labels = np.asarray(labels)
|
||||
n_docs = len(self)
|
||||
if n_classes is None:
|
||||
if classes_ is None:
|
||||
self.classes_ = np.unique(self.labels)
|
||||
self.classes_.sort()
|
||||
else:
|
||||
self.classes_ = np.arange(n_classes)
|
||||
self.index = {class_i: np.arange(n_docs)[self.labels == class_i] for class_i in self.classes_}
|
||||
self.classes_ = np.unique(np.asarray(classes_))
|
||||
self.classes_.sort()
|
||||
if len(set(self.labels).difference(set(classes_))) > 0:
|
||||
raise ValueError('labels contains values not included in classes_')
|
||||
self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
|
||||
|
||||
@classmethod
|
||||
def load(cls, path:str, loader_func:callable):
|
||||
def load(cls, path: str, loader_func: callable):
|
||||
return LabelledCollection(*loader_func(path))
|
||||
|
||||
def __len__(self):
|
||||
return self.instances.shape[0]
|
||||
|
||||
def prevalence(self):
|
||||
return self.counts()/len(self)
|
||||
return self.counts() / len(self)
|
||||
|
||||
def counts(self):
|
||||
return np.asarray([len(self.index[ci]) for ci in self.classes_])
|
||||
return np.asarray([len(self.index[class_]) for class_ in self.classes_])
|
||||
|
||||
@property
|
||||
def n_classes(self):
|
||||
|
@ -48,21 +60,21 @@ class LabelledCollection:
|
|||
def sampling_index(self, size, *prevs, shuffle=True):
|
||||
if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling
|
||||
return np.random.choice(len(self), size, replace=False)
|
||||
if len(prevs) == self.n_classes-1:
|
||||
prevs = prevs + (1-sum(prevs),)
|
||||
if len(prevs) == self.n_classes - 1:
|
||||
prevs = prevs + (1 - sum(prevs),)
|
||||
assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
|
||||
assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
|
||||
|
||||
taken = 0
|
||||
indexes_sample = []
|
||||
for i, class_i in enumerate(self.classes_):
|
||||
if i == self.n_classes-1:
|
||||
for i, class_ in enumerate(self.classes_):
|
||||
if i == self.n_classes - 1:
|
||||
n_requested = size - taken
|
||||
else:
|
||||
n_requested = int(size * prevs[i])
|
||||
|
||||
n_candidates = len(self.index[class_i])
|
||||
index_sample = self.index[class_i][
|
||||
n_candidates = len(self.index[class_])
|
||||
index_sample = self.index[class_][
|
||||
np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates))
|
||||
] if n_requested > 0 else []
|
||||
|
||||
|
@ -90,21 +102,22 @@ class LabelledCollection:
|
|||
def sampling_from_index(self, index):
|
||||
documents = self.instances[index]
|
||||
labels = self.labels[index]
|
||||
return LabelledCollection(documents, labels, n_classes=self.n_classes)
|
||||
return LabelledCollection(documents, labels, classes_=self.classes_)
|
||||
|
||||
def split_stratified(self, train_prop=0.6, random_state=None):
|
||||
# with temp_seed(42):
|
||||
tr_docs, te_docs, tr_labels, te_labels = \
|
||||
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state)
|
||||
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
|
||||
random_state=random_state)
|
||||
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
|
||||
|
||||
def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
|
||||
dimensions=self.n_classes
|
||||
dimensions = self.n_classes
|
||||
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
|
||||
yield self.sampling(sample_size, *prevs)
|
||||
|
||||
def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
|
||||
dimensions=self.n_classes
|
||||
dimensions = self.n_classes
|
||||
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
|
||||
yield self.sampling_index(sample_size, *prevs)
|
||||
|
||||
|
@ -142,10 +155,10 @@ class LabelledCollection:
|
|||
else:
|
||||
nfeats = '?'
|
||||
stats_ = {'instances': ninstances,
|
||||
'type': instance_type,
|
||||
'features': nfeats,
|
||||
'classes': self.n_classes,
|
||||
'prevs': strprev(self.prevalence())}
|
||||
'type': instance_type,
|
||||
'features': nfeats,
|
||||
'classes': self.classes_,
|
||||
'prevs': strprev(self.prevalence())}
|
||||
if show:
|
||||
print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '
|
||||
f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
|
||||
|
@ -155,13 +168,14 @@ class LabelledCollection:
|
|||
kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
|
||||
for train_index, test_index in kf.split(*self.Xy):
|
||||
train = self.sampling_from_index(train_index)
|
||||
test = self.sampling_from_index(test_index)
|
||||
test = self.sampling_from_index(test_index)
|
||||
yield train, test
|
||||
|
||||
|
||||
class Dataset:
|
||||
|
||||
def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
|
||||
assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections'
|
||||
assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections'
|
||||
self.training = training
|
||||
self.test = test
|
||||
self.vocabulary = vocabulary
|
||||
|
@ -171,6 +185,10 @@ class Dataset:
|
|||
def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
|
||||
return Dataset(*collection.split_stratified(train_prop=train_size))
|
||||
|
||||
@property
|
||||
def classes_(self):
|
||||
return self.training.classes_
|
||||
|
||||
@property
|
||||
def n_classes(self):
|
||||
return self.training.n_classes
|
||||
|
@ -195,19 +213,15 @@ class Dataset:
|
|||
print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
|
||||
f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
|
||||
f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
|
||||
return {'train': tr_stats ,'test':te_stats}
|
||||
return {'train': tr_stats, 'test': te_stats}
|
||||
|
||||
@classmethod
|
||||
def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):
|
||||
for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)):
|
||||
yield Dataset(train, test, name=f'fold {(i%nfolds)+1}/{nfolds} (round={(i//nfolds)+1})')
|
||||
yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')
|
||||
|
||||
|
||||
def isbinary(data):
|
||||
if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
|
||||
return data.binary
|
||||
return False
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -47,7 +47,7 @@ UCI_DATASETS = ['acute.a', 'acute.b',
|
|||
'yeast']
|
||||
|
||||
|
||||
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
|
||||
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
|
||||
"""
|
||||
Load a Reviews dataset as a Dataset instance, as used in:
|
||||
Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
|
||||
|
@ -91,7 +91,7 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
|
|||
return data
|
||||
|
||||
|
||||
def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False):
|
||||
def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
|
||||
"""
|
||||
Load a Twitter dataset as a Dataset instance, as used in:
|
||||
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||
|
@ -162,12 +162,12 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
|
|||
return data
|
||||
|
||||
|
||||
def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False):
|
||||
def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
|
||||
data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
|
||||
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
|
||||
|
||||
|
||||
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False):
|
||||
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
|
||||
|
||||
assert dataset_name in UCI_DATASETS, \
|
||||
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
|
||||
|
|
|
@ -29,13 +29,13 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
|
|||
test_documents = vectorizer.transform(dataset.test.instances)
|
||||
|
||||
if inplace:
|
||||
dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.n_classes)
|
||||
dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.n_classes)
|
||||
dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.classes_)
|
||||
dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_)
|
||||
dataset.vocabulary = vectorizer.vocabulary_
|
||||
return dataset
|
||||
else:
|
||||
training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.n_classes)
|
||||
test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.n_classes)
|
||||
training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_)
|
||||
test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_)
|
||||
return Dataset(training, test, vectorizer.vocabulary_)
|
||||
|
||||
|
||||
|
@ -66,8 +66,8 @@ def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
|
|||
dataset.test.instances = Xte
|
||||
return dataset
|
||||
else:
|
||||
training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.n_classes)
|
||||
test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.n_classes)
|
||||
training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.classes_)
|
||||
test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.classes_)
|
||||
return Dataset(training, test)
|
||||
|
||||
|
||||
|
@ -100,13 +100,13 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
|
|||
test_index = indexer.transform(dataset.test.instances)
|
||||
|
||||
if inplace:
|
||||
dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.n_classes)
|
||||
dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.n_classes)
|
||||
dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.classes_)
|
||||
dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.classes_)
|
||||
dataset.vocabulary = indexer.vocabulary_
|
||||
return dataset
|
||||
else:
|
||||
training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.n_classes)
|
||||
test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.n_classes)
|
||||
training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.classes_)
|
||||
test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.classes_)
|
||||
return Dataset(training, test, indexer.vocabulary_)
|
||||
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ from scipy.sparse import dok_matrix
|
|||
from tqdm import tqdm
|
||||
|
||||
|
||||
def from_text(path):
|
||||
def from_text(path, encoding='utf-8'):
|
||||
"""
|
||||
Reads a labelled collection of documents.
|
||||
File format <0 or 1>\t<document>\n
|
||||
|
@ -11,7 +11,7 @@ def from_text(path):
|
|||
:return: a list of sentences, and a list of labels
|
||||
"""
|
||||
all_sentences, all_labels = [], []
|
||||
for line in tqdm(open(path, 'rt').readlines(), f'loading {path}'):
|
||||
for line in tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}'):
|
||||
line = line.strip()
|
||||
if line:
|
||||
label, sentence = line.split('\t')
|
||||
|
@ -25,8 +25,8 @@ def from_text(path):
|
|||
|
||||
def from_sparse(path):
|
||||
"""
|
||||
Reas a labelled colletion of real-valued instances expressed in sparse format
|
||||
File fomart <-1 or 0 or 1>[\s col(int):val(float)]\n
|
||||
Reads a labelled collection of real-valued instances expressed in sparse format
|
||||
File format <-1 or 0 or 1>[\s col(int):val(float)]\n
|
||||
:param path: path to the labelled collection
|
||||
:return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
|
||||
"""
|
||||
|
@ -56,16 +56,16 @@ def from_sparse(path):
|
|||
return X, y
|
||||
|
||||
|
||||
def from_csv(path):
|
||||
def from_csv(path, encoding='utf-8'):
|
||||
"""
|
||||
Reas a csv file in which columns are separated by ','.
|
||||
File fomart <label>,<feat1>,<feat2>,...,<featn>\n
|
||||
Reads a csv file in which columns are separated by ','.
|
||||
File format <label>,<feat1>,<feat2>,...,<featn>\n
|
||||
:param path: path to the csv file
|
||||
:return: a ndarray for the labels and a ndarray (float) for the covariates
|
||||
"""
|
||||
|
||||
X, y = [], []
|
||||
for instance in tqdm(open(path, 'rt').readlines(), desc=f'reading {path}'):
|
||||
for instance in tqdm(open(path, 'rt', encoding=encoding).readlines(), desc=f'reading {path}'):
|
||||
yi, *xi = instance.strip().split(',')
|
||||
X.append(list(map(float,xi)))
|
||||
y.append(yi)
|
||||
|
|
|
@ -12,6 +12,7 @@ import quapy.functional as F
|
|||
import pandas as pd
|
||||
|
||||
|
||||
|
||||
def artificial_sampling_prediction(
|
||||
model: BaseQuantifier,
|
||||
test: LabelledCollection,
|
||||
|
@ -21,8 +22,7 @@ def artificial_sampling_prediction(
|
|||
eval_budget: int = None,
|
||||
n_jobs=1,
|
||||
random_seed=42,
|
||||
verbose=True
|
||||
):
|
||||
verbose=False):
|
||||
"""
|
||||
Performs the predictions for all samples generated according to the artificial sampling protocol.
|
||||
:param model: the model in charge of generating the class prevalence estimations
|
||||
|
@ -48,6 +48,45 @@ def artificial_sampling_prediction(
|
|||
with temp_seed(random_seed):
|
||||
indexes = list(test.artificial_sampling_index_generator(sample_size, n_prevpoints, n_repetitions))
|
||||
|
||||
return _predict_from_indexes(indexes, model, test, n_jobs, verbose)
|
||||
|
||||
|
||||
def natural_sampling_prediction(
|
||||
model: BaseQuantifier,
|
||||
test: LabelledCollection,
|
||||
sample_size,
|
||||
n_repetitions=1,
|
||||
n_jobs=1,
|
||||
random_seed=42,
|
||||
verbose=False):
|
||||
"""
|
||||
Performs the predictions for all samples generated according to the natural sampling protocol.
|
||||
:param model: the model in charge of generating the class prevalence estimations
|
||||
:param test: the test set on which to perform natural sampling
|
||||
:param sample_size: the size of the samples
|
||||
:param n_repetitions: the number of repetitions for each prevalence
|
||||
:param n_jobs: number of jobs to be run in parallel
|
||||
:param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
|
||||
any other random process.
|
||||
:param verbose: if True, shows a progress bar
|
||||
:return: two ndarrays of shape (m,n) with m the number of samples (n_repetitions) and n the
|
||||
number of classes. The first one contains the true prevalences for the samples generated while the second one
|
||||
contains the prevalence estimations
|
||||
"""
|
||||
|
||||
with temp_seed(random_seed):
|
||||
indexes = list(test.natural_sampling_index_generator(sample_size, n_repetitions))
|
||||
|
||||
return _predict_from_indexes(indexes, model, test, n_jobs, verbose)
|
||||
|
||||
|
||||
def _predict_from_indexes(
|
||||
indexes,
|
||||
model: BaseQuantifier,
|
||||
test: LabelledCollection,
|
||||
n_jobs=1,
|
||||
verbose=False):
|
||||
|
||||
if model.aggregative: #isinstance(model, qp.method.aggregative.AggregativeQuantifier):
|
||||
# print('\tinstance of aggregative-quantifier')
|
||||
quantification_func = model.aggregate
|
||||
|
@ -88,19 +127,43 @@ def artificial_sampling_report(
|
|||
n_jobs=1,
|
||||
random_seed=42,
|
||||
error_metrics:Iterable[Union[str,Callable]]='mae',
|
||||
verbose=True):
|
||||
verbose=False):
|
||||
|
||||
true_prevs, estim_prevs = artificial_sampling_prediction(
|
||||
model, test, sample_size, n_prevpoints, n_repetitions, eval_budget, n_jobs, random_seed, verbose
|
||||
)
|
||||
return _sampling_report(true_prevs, estim_prevs, error_metrics)
|
||||
|
||||
|
||||
def natural_sampling_report(
|
||||
model: BaseQuantifier,
|
||||
test: LabelledCollection,
|
||||
sample_size,
|
||||
n_repetitions=1,
|
||||
n_jobs=1,
|
||||
random_seed=42,
|
||||
error_metrics:Iterable[Union[str,Callable]]='mae',
|
||||
verbose=False):
|
||||
|
||||
true_prevs, estim_prevs = natural_sampling_prediction(
|
||||
model, test, sample_size, n_repetitions, n_jobs, random_seed, verbose
|
||||
)
|
||||
return _sampling_report(true_prevs, estim_prevs, error_metrics)
|
||||
|
||||
|
||||
def _sampling_report(
|
||||
true_prevs,
|
||||
estim_prevs,
|
||||
error_metrics: Iterable[Union[str, Callable]] = 'mae'):
|
||||
|
||||
if isinstance(error_metrics, str):
|
||||
error_metrics=[error_metrics]
|
||||
error_metrics = [error_metrics]
|
||||
|
||||
error_names = [e if isinstance(e, str) else e.__name__ for e in error_metrics]
|
||||
error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics]
|
||||
assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions'
|
||||
|
||||
df = pd.DataFrame(columns=['true-prev', 'estim-prev']+error_names)
|
||||
true_prevs, estim_prevs = artificial_sampling_prediction(
|
||||
model, test, sample_size, n_prevpoints, n_repetitions, eval_budget, n_jobs, random_seed, verbose
|
||||
)
|
||||
df = pd.DataFrame(columns=['true-prev', 'estim-prev'] + error_names)
|
||||
for true_prev, estim_prev in zip(true_prevs, estim_prevs):
|
||||
series = {'true-prev': true_prev, 'estim-prev': estim_prev}
|
||||
for error_name, error_metric in zip(error_names, error_funcs):
|
||||
|
@ -110,7 +173,6 @@ def artificial_sampling_report(
|
|||
|
||||
return df
|
||||
|
||||
|
||||
def artificial_sampling_eval(
|
||||
model: BaseQuantifier,
|
||||
test: LabelledCollection,
|
||||
|
@ -121,7 +183,7 @@ def artificial_sampling_eval(
|
|||
n_jobs=1,
|
||||
random_seed=42,
|
||||
error_metric:Union[str,Callable]='mae',
|
||||
verbose=True):
|
||||
verbose=False):
|
||||
|
||||
if isinstance(error_metric, str):
|
||||
error_metric = qp.error.from_name(error_metric)
|
||||
|
@ -135,6 +197,28 @@ def artificial_sampling_eval(
|
|||
return error_metric(true_prevs, estim_prevs)
|
||||
|
||||
|
||||
def natural_sampling_eval(
|
||||
model: BaseQuantifier,
|
||||
test: LabelledCollection,
|
||||
sample_size,
|
||||
n_repetitions=1,
|
||||
n_jobs=1,
|
||||
random_seed=42,
|
||||
error_metric:Union[str,Callable]='mae',
|
||||
verbose=False):
|
||||
|
||||
if isinstance(error_metric, str):
|
||||
error_metric = qp.error.from_name(error_metric)
|
||||
|
||||
assert hasattr(error_metric, '__call__'), 'invalid error function'
|
||||
|
||||
true_prevs, estim_prevs = natural_sampling_prediction(
|
||||
model, test, sample_size, n_repetitions, n_jobs, random_seed, verbose
|
||||
)
|
||||
|
||||
return error_metric(true_prevs, estim_prevs)
|
||||
|
||||
|
||||
def evaluate(model: BaseQuantifier, test_samples:Iterable[LabelledCollection], err:Union[str, Callable], n_jobs:int=-1):
|
||||
if isinstance(err, str):
|
||||
err = qp.error.from_name(err)
|
||||
|
@ -149,7 +233,7 @@ def _delayed_eval(args):
|
|||
return error(prev_true, prev_estim)
|
||||
|
||||
|
||||
def _check_num_evals(n_classes, n_prevpoints=None, eval_budget=None, n_repetitions=1, verbose=True):
|
||||
def _check_num_evals(n_classes, n_prevpoints=None, eval_budget=None, n_repetitions=1, verbose=False):
|
||||
if n_prevpoints is None and eval_budget is None:
|
||||
raise ValueError('either n_prevpoints or eval_budget has to be specified')
|
||||
elif n_prevpoints is None:
|
||||
|
|
|
@ -36,12 +36,12 @@ def prevalence_linspace(n_prevalences=21, repeat=1, smooth_limits_epsilon=0.01):
|
|||
return p
|
||||
|
||||
|
||||
def prevalence_from_labels(labels, n_classes):
|
||||
def prevalence_from_labels(labels, classes_):
|
||||
if labels.ndim != 1:
|
||||
raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
|
||||
unique, counts = np.unique(labels, return_counts=True)
|
||||
by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
|
||||
prevalences = np.asarray([by_class[ci] for ci in range(n_classes)], dtype=np.float)
|
||||
prevalences = np.asarray([by_class[class_] for class_ in classes_], dtype=np.float)
|
||||
prevalences /= prevalences.sum()
|
||||
return prevalences
|
||||
|
||||
|
@ -51,7 +51,7 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False):
|
|||
raise ValueError(f'param posteriors does not seem to be a ndarray of posteior probabilities')
|
||||
if binarize:
|
||||
predictions = np.argmax(posteriors, axis=-1)
|
||||
return prevalence_from_labels(predictions, n_classes=posteriors.shape[1])
|
||||
return prevalence_from_labels(predictions, np.arange(posteriors.shape[1]))
|
||||
else:
|
||||
prevalences = posteriors.mean(axis=0)
|
||||
prevalences /= prevalences.sum()
|
||||
|
|
|
@ -3,21 +3,31 @@ from . import base
|
|||
from . import meta
|
||||
from . import non_aggregative
|
||||
|
||||
EXPLICIT_LOSS_MINIMIZATION_METHODS = {
|
||||
aggregative.ELM,
|
||||
aggregative.SVMQ,
|
||||
aggregative.SVMAE,
|
||||
aggregative.SVMKLD,
|
||||
aggregative.SVMRAE,
|
||||
aggregative.SVMNKLD
|
||||
}
|
||||
|
||||
AGGREGATIVE_METHODS = {
|
||||
aggregative.CC,
|
||||
aggregative.ACC,
|
||||
aggregative.PCC,
|
||||
aggregative.PACC,
|
||||
aggregative.ELM,
|
||||
aggregative.EMQ,
|
||||
aggregative.HDy
|
||||
}
|
||||
} | EXPLICIT_LOSS_MINIMIZATION_METHODS
|
||||
|
||||
|
||||
NON_AGGREGATIVE_METHODS = {
|
||||
non_aggregative.MaximumLikelihoodPrevalenceEstimation
|
||||
}
|
||||
|
||||
META_METHODS = {
|
||||
meta.Ensemble,
|
||||
meta.QuaNet
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from abc import abstractmethod
|
||||
from copy import deepcopy
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from sklearn.base import BaseEstimator
|
||||
|
@ -8,6 +9,7 @@ from sklearn.calibration import CalibratedClassifierCV
|
|||
from sklearn.metrics import confusion_matrix
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from tqdm import tqdm
|
||||
|
||||
import quapy as qp
|
||||
import quapy.functional as F
|
||||
from quapy.classification.svmperf import SVMperf
|
||||
|
@ -43,7 +45,7 @@ class AggregativeQuantifier(BaseQuantifier):
|
|||
return self.aggregate(classif_predictions)
|
||||
|
||||
@abstractmethod
|
||||
def aggregate(self, classif_predictions:np.ndarray): ...
|
||||
def aggregate(self, classif_predictions: np.ndarray): ...
|
||||
|
||||
def get_params(self, deep=True):
|
||||
return self.learner.get_params()
|
||||
|
@ -53,10 +55,10 @@ class AggregativeQuantifier(BaseQuantifier):
|
|||
|
||||
@property
|
||||
def n_classes(self):
|
||||
return len(self.classes)
|
||||
return len(self.classes_)
|
||||
|
||||
@property
|
||||
def classes(self):
|
||||
def classes_(self):
|
||||
return self.learner.classes_
|
||||
|
||||
@property
|
||||
|
@ -84,7 +86,7 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
|
|||
|
||||
def set_params(self, **parameters):
|
||||
if isinstance(self.learner, CalibratedClassifierCV):
|
||||
parameters = {'base_estimator__'+k:v for k,v in parameters.items()}
|
||||
parameters = {'base_estimator__' + k: v for k, v in parameters.items()}
|
||||
self.learner.set_params(**parameters)
|
||||
|
||||
@property
|
||||
|
@ -98,7 +100,7 @@ def training_helper(learner,
|
|||
data: LabelledCollection,
|
||||
fit_learner: bool = True,
|
||||
ensure_probabilistic=False,
|
||||
val_split:Union[LabelledCollection, float]=None):
|
||||
val_split: Union[LabelledCollection, float] = None):
|
||||
"""
|
||||
Training procedure common to all Aggregative Quantifiers.
|
||||
:param learner: the learner to be fit
|
||||
|
@ -122,13 +124,14 @@ def training_helper(learner,
|
|||
if isinstance(val_split, float):
|
||||
if not (0 < val_split < 1):
|
||||
raise ValueError(f'train/val split {val_split} out of range, must be in (0,1)')
|
||||
train, unused = data.split_stratified(train_prop=1-val_split)
|
||||
elif val_split.__class__.__name__ == LabelledCollection.__name__: #isinstance(val_split, LabelledCollection):
|
||||
train, unused = data.split_stratified(train_prop=1 - val_split)
|
||||
elif val_split.__class__.__name__ == LabelledCollection.__name__: # isinstance(val_split, LabelledCollection):
|
||||
train = data
|
||||
unused = val_split
|
||||
else:
|
||||
raise ValueError(f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split '
|
||||
'proportion, or a LabelledCollection indicating the validation split')
|
||||
raise ValueError(
|
||||
f'param "val_split" ({type(val_split)}) not understood; use either a float indicating the split '
|
||||
'proportion, or a LabelledCollection indicating the validation split')
|
||||
else:
|
||||
train, unused = data, None
|
||||
|
||||
|
@ -153,7 +156,7 @@ class CC(AggregativeQuantifier):
|
|||
attributed each of the classes in order to compute class prevalence estimates.
|
||||
"""
|
||||
|
||||
def __init__(self, learner:BaseEstimator):
|
||||
def __init__(self, learner: BaseEstimator):
|
||||
self.learner = learner
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True):
|
||||
|
@ -167,16 +170,16 @@ class CC(AggregativeQuantifier):
|
|||
return self
|
||||
|
||||
def aggregate(self, classif_predictions):
|
||||
return F.prevalence_from_labels(classif_predictions, self.n_classes)
|
||||
return F.prevalence_from_labels(classif_predictions, self.classes_)
|
||||
|
||||
|
||||
class ACC(AggregativeQuantifier):
|
||||
|
||||
def __init__(self, learner:BaseEstimator, val_split=0.4):
|
||||
def __init__(self, learner: BaseEstimator, val_split=0.4):
|
||||
self.learner = learner
|
||||
self.val_split = val_split
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection]=None):
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
|
||||
"""
|
||||
Trains a ACC quantifier
|
||||
:param data: the training set
|
||||
|
@ -262,7 +265,7 @@ class PACC(AggregativeProbabilisticQuantifier):
|
|||
self.learner = learner
|
||||
self.val_split = val_split
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=None):
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
|
||||
"""
|
||||
Trains a PACC quantifier
|
||||
:param data: the training set
|
||||
|
@ -294,7 +297,8 @@ class PACC(AggregativeProbabilisticQuantifier):
|
|||
y_ = np.vstack(y_)
|
||||
|
||||
# fit the learner on all data
|
||||
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True, val_split=None)
|
||||
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True,
|
||||
val_split=None)
|
||||
|
||||
else:
|
||||
self.learner, val_data = training_helper(
|
||||
|
@ -307,8 +311,8 @@ class PACC(AggregativeProbabilisticQuantifier):
|
|||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||
# document that belongs to yj ends up being classified as belonging to yi
|
||||
confusion = np.empty(shape=(data.n_classes, data.n_classes))
|
||||
for yi in range(data.n_classes):
|
||||
confusion[yi] = y_[y==yi].mean(axis=0)
|
||||
for i,class_ in enumerate(data.classes_):
|
||||
confusion[i] = y_[y == class_].mean(axis=0)
|
||||
|
||||
self.Pte_cond_estim_ = confusion.T
|
||||
|
||||
|
@ -338,7 +342,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
|
|||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True):
|
||||
self.learner, _ = training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
||||
self.train_prevalence = F.prevalence_from_labels(data.labels, self.n_classes)
|
||||
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
|
||||
return self
|
||||
|
||||
def aggregate(self, classif_posteriors, epsilon=EPSILON):
|
||||
|
@ -366,7 +370,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
|
|||
# M-step:
|
||||
qs = ps.mean(axis=0)
|
||||
|
||||
if qs_prev_ is not None and qp.error.mae(qs, qs_prev_) < epsilon and s>10:
|
||||
if qs_prev_ is not None and qp.error.mae(qs, qs_prev_) < epsilon and s > 10:
|
||||
converged = True
|
||||
|
||||
qs_prev_ = qs
|
||||
|
@ -389,7 +393,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
self.learner = learner
|
||||
self.val_split = val_split
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=None):
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = None):
|
||||
"""
|
||||
Trains a HDy quantifier
|
||||
:param data: the training set
|
||||
|
@ -405,13 +409,15 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
self._check_binary(data, self.__class__.__name__)
|
||||
self.learner, validation = training_helper(
|
||||
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
|
||||
Px = self.posterior_probabilities(validation.instances)[:,1] # takes only the P(y=+1|x)
|
||||
self.Pxy1 = Px[validation.labels == 1]
|
||||
self.Pxy0 = Px[validation.labels == 0]
|
||||
Px = self.posterior_probabilities(validation.instances)[:, 1] # takes only the P(y=+1|x)
|
||||
self.Pxy1 = Px[validation.labels == self.learner.classes_[1]]
|
||||
self.Pxy0 = Px[validation.labels == self.learner.classes_[0]]
|
||||
# pre-compute the histogram for positive and negative examples
|
||||
self.bins = np.linspace(10, 110, 11, dtype=int) #[10, 20, 30, ..., 100, 110]
|
||||
self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins}
|
||||
self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in self.bins}
|
||||
self.bins = np.linspace(10, 110, 11, dtype=int) # [10, 20, 30, ..., 100, 110]
|
||||
self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in
|
||||
self.bins}
|
||||
self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in
|
||||
self.bins}
|
||||
return self
|
||||
|
||||
def aggregate(self, classif_posteriors):
|
||||
|
@ -419,12 +425,12 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
# and the final estimated a priori probability was taken as the median of these 11 estimates."
|
||||
# (González-Castro, et al., 2013).
|
||||
|
||||
Px = classif_posteriors[:,1] # takes only the P(y=+1|x)
|
||||
Px = classif_posteriors[:, 1] # takes only the P(y=+1|x)
|
||||
|
||||
prev_estimations = []
|
||||
#for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
|
||||
#Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
|
||||
#Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
|
||||
# for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110]
|
||||
# Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)
|
||||
# Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)
|
||||
for bins in self.bins:
|
||||
Pxy0_density = self.Pxy0_density[bins]
|
||||
Pxy1_density = self.Pxy1_density[bins]
|
||||
|
@ -433,14 +439,14 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
|
||||
prev_selected, min_dist = None, None
|
||||
for prev in F.prevalence_linspace(n_prevalences=100, repeat=1, smooth_limits_epsilon=0.0):
|
||||
Px_train = prev*Pxy1_density + (1 - prev)*Pxy0_density
|
||||
Px_train = prev * Pxy1_density + (1 - prev) * Pxy0_density
|
||||
hdy = F.HellingerDistance(Px_train, Px_test)
|
||||
if prev_selected is None or hdy < min_dist:
|
||||
prev_selected, min_dist = prev, hdy
|
||||
prev_estimations.append(prev_selected)
|
||||
|
||||
pos_class_prev = np.median(prev_estimations)
|
||||
return np.asarray([1-pos_class_prev, pos_class_prev])
|
||||
class1_prev = np.median(prev_estimations)
|
||||
return np.asarray([1 - class1_prev, class1_prev])
|
||||
|
||||
|
||||
class ELM(AggregativeQuantifier, BinaryQuantifier):
|
||||
|
@ -457,8 +463,8 @@ class ELM(AggregativeQuantifier, BinaryQuantifier):
|
|||
self.learner.fit(data.instances, data.labels)
|
||||
return self
|
||||
|
||||
def aggregate(self, classif_predictions:np.ndarray):
|
||||
return F.prevalence_from_labels(classif_predictions, self.learner.n_classes_)
|
||||
def aggregate(self, classif_predictions: np.ndarray):
|
||||
return F.prevalence_from_labels(classif_predictions, self.classes_)
|
||||
|
||||
def classify(self, X, y=None):
|
||||
return self.learner.predict(X)
|
||||
|
@ -470,6 +476,7 @@ class SVMQ(ELM):
|
|||
Quantification-oriented learning based on reliable classifiers.
|
||||
Pattern Recognition, 48(2):591–604.
|
||||
"""
|
||||
|
||||
def __init__(self, svmperf_base=None, **kwargs):
|
||||
super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
|
||||
|
||||
|
@ -480,6 +487,7 @@ class SVMKLD(ELM):
|
|||
Optimizing text quantifiers for multivariate loss functions.
|
||||
ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27.
|
||||
"""
|
||||
|
||||
def __init__(self, svmperf_base=None, **kwargs):
|
||||
super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)
|
||||
|
||||
|
@ -490,6 +498,7 @@ class SVMNKLD(ELM):
|
|||
Optimizing text quantifiers for multivariate loss functions.
|
||||
ACM Transactions on Knowledge Discovery and Data, 9(4):Article 27.
|
||||
"""
|
||||
|
||||
def __init__(self, svmperf_base=None, **kwargs):
|
||||
super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)
|
||||
|
||||
|
@ -531,7 +540,7 @@ class OneVsAll(AggregativeQuantifier):
|
|||
f'{self.__class__.__name__} expect non-binary data'
|
||||
assert isinstance(self.binary_quantifier, BaseQuantifier), \
|
||||
f'{self.binary_quantifier} does not seem to be a Quantifier'
|
||||
assert fit_learner==True, 'fit_learner must be True'
|
||||
assert fit_learner == True, 'fit_learner must be True'
|
||||
|
||||
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
|
||||
self.__parallel(self._delayed_binary_fit, data)
|
||||
|
@ -559,11 +568,11 @@ class OneVsAll(AggregativeQuantifier):
|
|||
|
||||
def aggregate(self, classif_predictions_bin):
|
||||
if self.probabilistic:
|
||||
assert classif_predictions_bin.shape[1]==self.n_classes and classif_predictions_bin.shape[2]==2, \
|
||||
assert classif_predictions_bin.shape[1] == self.n_classes and classif_predictions_bin.shape[2] == 2, \
|
||||
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of posterior ' \
|
||||
'probabilities (2 dimensions) for each document (row) and class (columns)'
|
||||
else:
|
||||
assert set(np.unique(classif_predictions_bin)).issubset({0,1}), \
|
||||
assert set(np.unique(classif_predictions_bin)).issubset({0, 1}), \
|
||||
'param classif_predictions_bin does not seem to be a valid matrix (ndarray) of binary ' \
|
||||
'predictions for each document (row) and class (columns)'
|
||||
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions_bin)
|
||||
|
@ -581,12 +590,12 @@ class OneVsAll(AggregativeQuantifier):
|
|||
# some quantifiers (in particular, ELM-based ones) cannot be run with multiprocess, since the temp dir they
|
||||
# create during the fit will be removed and be no longer available for the predict...
|
||||
Parallel(n_jobs=self.n_jobs, backend='threading')(
|
||||
delayed(func)(c, *args, **kwargs) for c in self.classes
|
||||
delayed(func)(c, *args, **kwargs) for c in self.classes_
|
||||
)
|
||||
)
|
||||
|
||||
@property
|
||||
def classes(self):
|
||||
def classes_(self):
|
||||
return sorted(self.dict_binary_quantifiers.keys())
|
||||
|
||||
def set_params(self, **parameters):
|
||||
|
@ -606,7 +615,7 @@ class OneVsAll(AggregativeQuantifier):
|
|||
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
|
||||
|
||||
def _delayed_binary_fit(self, c, data):
|
||||
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
|
||||
bindata = LabelledCollection(data.instances, data.labels == c, classes_=[False, True])
|
||||
self.dict_binary_quantifiers[c].fit(bindata)
|
||||
|
||||
@property
|
||||
|
@ -616,9 +625,3 @@ class OneVsAll(AggregativeQuantifier):
|
|||
@property
|
||||
def probabilistic(self):
|
||||
return self.binary_quantifier.probabilistic
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -19,6 +19,10 @@ class BaseQuantifier(metaclass=ABCMeta):
|
|||
@abstractmethod
|
||||
def get_params(self, deep=True): ...
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def classes_(self): ...
|
||||
|
||||
# these methods allows meta-learners to reimplement the decision based on their constituents, and not
|
||||
# based on class structure
|
||||
@property
|
||||
|
|
|
@ -1,28 +1,32 @@
|
|||
from copy import deepcopy
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import f1_score, make_scorer, accuracy_score
|
||||
from sklearn.model_selection import GridSearchCV, cross_val_predict
|
||||
from tqdm import tqdm
|
||||
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import GridSearchCV, cross_val_predict
|
||||
|
||||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy import functional as F
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.evaluation import evaluate
|
||||
from quapy.model_selection import GridSearchQ
|
||||
from . import neural
|
||||
from .base import BaseQuantifier
|
||||
from quapy.method.aggregative import CC, ACC, PCC, PACC, HDy, EMQ
|
||||
|
||||
QuaNet = neural.QuaNetTrainer
|
||||
try:
|
||||
from . import neural
|
||||
except ModuleNotFoundError:
|
||||
neural = None
|
||||
from .base import BaseQuantifier
|
||||
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ
|
||||
|
||||
if neural:
|
||||
QuaNet = neural.QuaNetTrainer
|
||||
else:
|
||||
QuaNet = "QuaNet is not available due to missing torch package"
|
||||
|
||||
|
||||
class Ensemble(BaseQuantifier):
|
||||
|
||||
VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES
|
||||
|
||||
"""
|
||||
|
@ -65,9 +69,9 @@ class Ensemble(BaseQuantifier):
|
|||
if self.verbose:
|
||||
print('[Ensemble]' + msg)
|
||||
|
||||
def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float]=None):
|
||||
def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float] = None):
|
||||
self.sout('Fit')
|
||||
if self.policy=='ds' and not data.binary:
|
||||
if self.policy == 'ds' and not data.binary:
|
||||
raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary')
|
||||
if val_split is None:
|
||||
val_split = self.val_split
|
||||
|
@ -132,7 +136,7 @@ class Ensemble(BaseQuantifier):
|
|||
tests = [m[3] for m in self.ensemble]
|
||||
scores = []
|
||||
for i, model in enumerate(self.ensemble):
|
||||
scores.append(evaluate(model[0], tests[:i] + tests[i+1:], error, self.n_jobs))
|
||||
scores.append(evaluate(model[0], tests[:i] + tests[i + 1:], error, self.n_jobs))
|
||||
order = np.argsort(scores)
|
||||
|
||||
self.ensemble = _select_k(self.ensemble, order, k=self.red_size)
|
||||
|
@ -168,7 +172,7 @@ class Ensemble(BaseQuantifier):
|
|||
lr_base = LogisticRegression(class_weight='balanced', max_iter=1000)
|
||||
|
||||
optim = GridSearchCV(
|
||||
lr_base, param_grid={'C': np.logspace(-4,4,9)}, cv=5, n_jobs=self.n_jobs, refit=True
|
||||
lr_base, param_grid={'C': np.logspace(-4, 4, 9)}, cv=5, n_jobs=self.n_jobs, refit=True
|
||||
).fit(X, y)
|
||||
|
||||
posteriors = cross_val_predict(
|
||||
|
@ -186,6 +190,10 @@ class Ensemble(BaseQuantifier):
|
|||
order = np.argsort(dist)
|
||||
return _select_k(predictions, order, k=self.red_size)
|
||||
|
||||
@property
|
||||
def classes_(self):
|
||||
return self.base_quantifier.classes_
|
||||
|
||||
@property
|
||||
def binary(self):
|
||||
return self.base_quantifier.binary
|
||||
|
@ -200,8 +208,8 @@ class Ensemble(BaseQuantifier):
|
|||
|
||||
|
||||
def get_probability_distribution(posterior_probabilities, bins=8):
    """
    Summarises the positive-class posteriors of a binary problem as a normalised histogram.

    :param posterior_probabilities: array with two columns; column 1 is read as the posterior probability of the
        positive class
    :param bins: number of equal-width bins into which the interval [0, 1] is divided
    :return: array of length `bins` containing the density histogram of the positive posteriors
    """
    assert posterior_probabilities.shape[1] == 2, 'the posterior probabilities do not seem to be for a binary problem'
    # keep the two-column input and the 1-D slice under different names, so that the shape check above is never
    # re-applied to the already-sliced vector
    positive_posteriors = posterior_probabilities[:, 1]  # take the positive posteriors only
    distribution, _ = np.histogram(positive_posteriors, bins=bins, range=(0, 1), density=True)
    return distribution
|
||||
|
||||
|
@ -219,7 +227,7 @@ def _delayed_new_instance(args):
|
|||
if val_split is not None:
|
||||
if isinstance(val_split, float):
|
||||
assert 0 < val_split < 1, 'val_split should be in (0,1)'
|
||||
data, val_split = data.split_stratified(train_prop=1-val_split)
|
||||
data, val_split = data.split_stratified(train_prop=1 - val_split)
|
||||
|
||||
sample_index = data.sampling_index(sample_size, *prev)
|
||||
sample = data.sampling_from_index(sample_index)
|
||||
|
@ -251,7 +259,7 @@ def _draw_simplex(ndim, min_val, max_trials=100):
|
|||
:return: a sample from the ndim-dimensional simplex that is uniform in S(ndim)-R where S(ndim) is the simplex
|
||||
and R is the simplex subset containing dimensions lower than min_val
|
||||
"""
|
||||
if min_val >= 1/ndim:
|
||||
if min_val >= 1 / ndim:
|
||||
raise ValueError(f'no sample can be draw from the {ndim}-dimensional simplex so that '
|
||||
f'all its values are >={min_val} (try with a larger value for min_pos)')
|
||||
trials = 0
|
||||
|
@ -296,14 +304,15 @@ def _check_error(error):
|
|||
f'the name of an error function in {qp.error.ERROR_NAMES}')
|
||||
|
||||
|
||||
def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, param_model_sel:dict=None, **kwargs):
|
||||
if optim is not None:
|
||||
if param_grid is None:
|
||||
raise ValueError(f'param_grid is None but optim was requested.')
|
||||
if param_model_sel is None:
|
||||
raise ValueError(f'param_model_sel is None but optim was requested.')
|
||||
error = _check_error(optim)
|
||||
return _instantiate_ensemble(learner, base_quantifier_class, param_grid, error, param_model_sel, **kwargs)
|
||||
def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, param_model_sel: dict = None,
|
||||
**kwargs):
|
||||
if optim is not None:
|
||||
if param_grid is None:
|
||||
raise ValueError(f'param_grid is None but optim was requested.')
|
||||
if param_model_sel is None:
|
||||
raise ValueError(f'param_model_sel is None but optim was requested.')
|
||||
error = _check_error(optim)
|
||||
return _instantiate_ensemble(learner, base_quantifier_class, param_grid, error, param_model_sel, **kwargs)
|
||||
|
||||
|
||||
def ECC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
|
||||
|
@ -323,4 +332,4 @@ def EHDy(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
|
|||
|
||||
|
||||
def EEMQ(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
|
||||
return ensembleFactory(learner, EMQ, param_grid, optim, param_mod_sel, **kwargs)
|
||||
return ensembleFactory(learner, EMQ, param_grid, optim, param_mod_sel, **kwargs)
|
||||
|
|
|
@ -58,6 +58,7 @@ class QuaNetTrainer(BaseQuantifier):
|
|||
self.device = torch.device(device)
|
||||
|
||||
self.__check_params_colision(self.quanet_params, self.learner.get_params())
|
||||
self._classes_ = None
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True):
|
||||
"""
|
||||
|
@ -67,6 +68,7 @@ class QuaNetTrainer(BaseQuantifier):
|
|||
:param fit_learner: if true, trains the classifier on a split containing 40% of the data
|
||||
:return: self
|
||||
"""
|
||||
self._classes_ = data.classes_
|
||||
classifier_data, unused_data = data.split_stratified(0.4)
|
||||
train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20%
|
||||
|
||||
|
@ -256,6 +258,10 @@ class QuaNetTrainer(BaseQuantifier):
|
|||
import shutil
|
||||
shutil.rmtree(self.checkpointdir, ignore_errors=True)
|
||||
|
||||
@property
|
||||
def classes_(self):
|
||||
return self._classes_
|
||||
|
||||
|
||||
def mae_loss(output, target):
|
||||
return torch.mean(torch.abs(output - target))
|
||||
|
|
|
@ -2,18 +2,22 @@ from quapy.data import LabelledCollection
|
|||
from .base import BaseQuantifier
|
||||
|
||||
|
||||
|
||||
class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
pass
|
||||
self._classes_ = None
|
||||
|
||||
def fit(self, data: LabelledCollection, *args):
    """
    Records the classes and the class prevalence of the training set.

    :param data: the training set
    :param args: unused
    :return: self
    """
    self._classes_ = data.classes_
    self.estimated_prevalence = data.prevalence()
    # return the fitted instance, as the other quantifiers' fit methods do, so that calls can be chained
    return self
|
||||
|
||||
def quantify(self, documents, *args):
|
||||
return self.estimated_prevalence
|
||||
|
||||
@property
|
||||
def classes_(self):
|
||||
return self._classes_
|
||||
|
||||
def get_params(self, deep=True):
    """
    Returns the hyperparameters of this quantifier; there are none, since the constructor discards its arguments.

    :param deep: unused; accepted so that the signature matches the one declared in BaseQuantifier
    :return: an empty dict
    """
    return {}
|
||||
|
||||
|
|
|
@ -4,7 +4,6 @@ from copy import deepcopy
|
|||
from typing import Union, Callable
|
||||
|
||||
import quapy as qp
|
||||
import quapy.functional as F
|
||||
from quapy.data.base import LabelledCollection
|
||||
from quapy.evaluation import artificial_sampling_prediction
|
||||
from quapy.method.aggregative import BaseQuantifier
|
||||
|
@ -80,7 +79,7 @@ class GridSearchQ(BaseQuantifier):
|
|||
return training, validation
|
||||
elif isinstance(validation, float):
|
||||
assert 0. < validation < 1., 'validation proportion should be in (0,1)'
|
||||
training, validation = training.split_stratified(train_prop=1-validation)
|
||||
training, validation = training.split_stratified(train_prop=1 - validation)
|
||||
return training, validation
|
||||
else:
|
||||
raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
|
||||
|
@ -97,7 +96,7 @@ class GridSearchQ(BaseQuantifier):
|
|||
raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'
|
||||
f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}')
|
||||
|
||||
def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float]=None):
|
||||
def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float] = None):
|
||||
"""
|
||||
:param training: the training set on which to optimize the hyperparameters
|
||||
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or
|
||||
|
@ -118,6 +117,7 @@ class GridSearchQ(BaseQuantifier):
|
|||
def handler(signum, frame):
|
||||
self.sout('timeout reached')
|
||||
raise TimeoutError()
|
||||
|
||||
signal.signal(signal.SIGALRM, handler)
|
||||
|
||||
self.sout(f'starting optimization with n_jobs={n_jobs}')
|
||||
|
@ -175,6 +175,10 @@ class GridSearchQ(BaseQuantifier):
|
|||
def quantify(self, instances):
|
||||
return self.best_model_.quantify(instances)
|
||||
|
||||
@property
|
||||
def classes_(self):
|
||||
return self.best_model_.classes_
|
||||
|
||||
def set_params(self, **parameters):
|
||||
self.param_grid = parameters
|
||||
|
||||
|
@ -185,4 +189,3 @@ class GridSearchQ(BaseQuantifier):
|
|||
if hasattr(self, 'best_model_'):
|
||||
return self.best_model_
|
||||
raise ValueError('best_model called before fit')
|
||||
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
import pytest
|
||||
|
||||
def test_import():
    """Smoke test: the package can be imported and declares a version."""
    import quapy
    assert quapy.__version__ is not None
|
|
@ -0,0 +1,43 @@
|
|||
import pytest
|
||||
|
||||
from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
|
||||
TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, fetch_reviews, fetch_twitter, fetch_UCIDataset
|
||||
|
||||
|
||||
@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
def test_fetch_reviews(dataset_name):
    """Fetches a reviews dataset and reports the stats of its training and test sets."""
    reviews = fetch_reviews(dataset_name)
    print(f'Dataset {dataset_name}')
    # the split attribute is looked up only after its heading has been printed
    for heading, split_name in (('Training set stats', 'training'), ('Test set stats', 'test')):
        print(heading)
        getattr(reviews, split_name).stats()
|
||||
|
||||
|
||||
@pytest.mark.parametrize('dataset_name', TWITTER_SENTIMENT_DATASETS_TEST + TWITTER_SENTIMENT_DATASETS_TRAIN)
def test_fetch_twitter(dataset_name):
    """
    Fetches a Twitter dataset and reports the stats of its training and test sets. The "semeval" dataset is
    fetched a second time with for_model_selection=True when the first attempt is refused for that reason.
    """
    try:
        dataset = fetch_twitter(dataset_name)
    except ValueError as ve:
        expected = 'dataset "semeval" can only be used for model selection.'
        if dataset_name == 'semeval' and str(ve).startswith(expected):
            dataset = fetch_twitter(dataset_name, for_model_selection=True)
        else:
            # any other ValueError is a genuine failure; swallowing it would only defer the crash to the
            # unbound `dataset` below
            raise
    print(f'Dataset {dataset_name}')
    print('Training set stats')
    dataset.training.stats()
    print('Test set stats')
    dataset.test.stats()
|
||||
|
||||
|
||||
@pytest.mark.parametrize('dataset_name', UCI_DATASETS)
def test_fetch_UCIDataset(dataset_name):
    """
    Fetches a UCI dataset and reports the stats of its training and test sets. The "pageblocks.5" dataset is
    skipped when its files are reported as missing on a first load.
    """
    try:
        dataset = fetch_UCIDataset(dataset_name)
    except FileNotFoundError as fnfe:
        hint = 'If this is the first time you attempt to load this dataset'
        # str(fnfe) rather than fnfe.args[0]: errors raised by the OS carry an int errno as first argument
        if dataset_name == 'pageblocks.5' and hint in str(fnfe):
            pytest.skip('The pageblocks.5 dataset requires some hand processing to be usable, skipping this test.')
        # any other missing file is a genuine failure
        raise
    print(f'Dataset {dataset_name}')
    print('Training set stats')
    dataset.training.stats()
    print('Test set stats')
    dataset.test.stats()
|
|
@ -0,0 +1,185 @@
|
|||
import numpy
|
||||
import pytest
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
import quapy as qp
|
||||
from quapy.data import Dataset, LabelledCollection
|
||||
from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS, EXPLICIT_LOSS_MINIMIZATION_METHODS
|
||||
from quapy.method.aggregative import ACC, PACC, HDy
|
||||
from quapy.method.meta import Ensemble
|
||||
|
||||
datasets = [pytest.param(qp.datasets.fetch_twitter('hcr'), id='hcr'),
|
||||
pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]
|
||||
|
||||
learners = [LogisticRegression, LinearSVC]
|
||||
|
||||
|
||||
@pytest.mark.parametrize('dataset', datasets)
@pytest.mark.parametrize('aggregative_method', AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS))
@pytest.mark.parametrize('learner', learners)
def test_aggregative_methods(dataset: Dataset, aggregative_method, learner):
    """
    Fits every aggregative method (explicit-loss-minimization ones excluded) on top of each learner and checks
    that the MAE between the true and the estimated test prevalences is a numpy float.
    """
    model = aggregative_method(learner())

    if model.binary and not dataset.binary:
        # report the combination as skipped; a bare return would count it as passed although nothing ran
        pytest.skip(f'skipping the test of binary model {type(model)} on non-binary dataset {dataset}')

    model.fit(dataset.training)

    estim_prevalences = model.quantify(dataset.test.instances)
    true_prevalences = dataset.test.prevalence()
    error = qp.error.mae(true_prevalences, estim_prevalences)

    assert isinstance(error, numpy.float64)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('dataset', datasets)
@pytest.mark.parametrize('elm_method', EXPLICIT_LOSS_MINIMIZATION_METHODS)
def test_elm_methods(dataset: Dataset, elm_method):
    """
    Fits every explicit-loss-minimization method and checks that the MAE between the true and the estimated
    test prevalences is a numpy float. The test is skipped when the constructor reports an invalid path.
    """
    try:
        model = elm_method()
    except AssertionError as ae:
        if 'does not seem to point to a valid path' in str(ae):
            pytest.skip('Missing SVMperf binary program, skipping test')
        # any other assertion is a genuine failure; swallowing it would leave `model` unbound below
        raise

    if model.binary and not dataset.binary:
        # report the combination as skipped; a bare return would count it as passed although nothing ran
        pytest.skip(f'skipping the test of binary model {model} on non-binary dataset {dataset}')

    model.fit(dataset.training)

    estim_prevalences = model.quantify(dataset.test.instances)
    true_prevalences = dataset.test.prevalence()
    error = qp.error.mae(true_prevalences, estim_prevalences)

    assert isinstance(error, numpy.float64)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('dataset', datasets)
@pytest.mark.parametrize('non_aggregative_method', NON_AGGREGATIVE_METHODS)
def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
    """
    Fits every non-aggregative method and checks that the MAE between the true and the estimated test
    prevalences is a numpy float.
    """
    model = non_aggregative_method()

    if model.binary and not dataset.binary:
        # report the combination as skipped; a bare return would count it as passed although nothing ran
        pytest.skip(f'skipping the test of binary model {model} on non-binary dataset {dataset}')

    model.fit(dataset.training)

    estim_prevalences = model.quantify(dataset.test.instances)
    true_prevalences = dataset.test.prevalence()
    error = qp.error.mae(true_prevalences, estim_prevalences)

    assert isinstance(error, numpy.float64)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('base_method', AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS))
@pytest.mark.parametrize('learner', learners)
@pytest.mark.parametrize('dataset', datasets)
@pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES)
def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
    """
    Fits a 5-member Ensemble of each aggregative base method under every valid policy and checks that the MAE
    between the true and the estimated test prevalences is a numpy float.
    """
    qp.environ['SAMPLE_SIZE'] = len(dataset.training)
    model = Ensemble(quantifier=base_method(learner()), size=5, policy=policy, n_jobs=-1)

    if model.binary and not dataset.binary:
        # report the combination as skipped; a bare return would count it as passed although nothing ran
        pytest.skip(f'skipping the test of binary model {model} on non-binary dataset {dataset}')

    model.fit(dataset.training)

    estim_prevalences = model.quantify(dataset.test.instances)
    true_prevalences = dataset.test.prevalence()
    error = qp.error.mae(true_prevalences, estim_prevalences)

    assert isinstance(error, numpy.float64)
|
||||
|
||||
|
||||
def test_quanet_method():
    """
    Trains QuaNet on top of a CNN classifier, using samples drawn with `sampling(100, ...)` from the kindle
    reviews dataset, and checks that the MAE between the true and the estimated test prevalences is a numpy
    float. The test is skipped when the neural modules cannot be imported.
    """
    try:
        import torch
        import quapy.classification.neural
    except ModuleNotFoundError:
        pytest.skip('skipping QuaNet test due to missing torch package')

    # fall back to the CPU when no GPU is present instead of failing on a hard-coded 'cuda' device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
    dataset = Dataset(dataset.training.sampling(100, *dataset.training.prevalence()),
                      dataset.test.sampling(100, *dataset.test.prevalence()))
    qp.data.preprocessing.index(dataset, min_df=5, inplace=True)

    from quapy.classification.neural import CNNnet
    cnn = CNNnet(dataset.vocabulary_size, dataset.training.n_classes)

    from quapy.classification.neural import NeuralClassifierTrainer
    learner = NeuralClassifierTrainer(cnn, device=device)

    from quapy.method.meta import QuaNet
    model = QuaNet(learner, sample_size=len(dataset.training), device=device)

    if model.binary and not dataset.binary:
        # report the combination as skipped; a bare return would count it as passed although nothing ran
        pytest.skip(f'skipping the test of binary model {model} on non-binary dataset {dataset}')

    model.fit(dataset.training)

    estim_prevalences = model.quantify(dataset.test.instances)
    true_prevalences = dataset.test.prevalence()
    error = qp.error.mae(true_prevalences, estim_prevalences)

    assert isinstance(error, numpy.float64)
|
||||
|
||||
|
||||
def models_to_test_for_str_label_names():
    """
    Builds the models used to parametrize the string-label test: every aggregative method that is not an
    explicit-loss-minimization one, wrapped around a LogisticRegression, followed by every non-aggregative method.

    :return: list of unfitted model instances
    """
    aggregative = AGGREGATIVE_METHODS.difference(EXPLICIT_LOSS_MINIMIZATION_METHODS)
    instances = [quantifier_class(LogisticRegression()) for quantifier_class in aggregative]
    instances.extend(quantifier_class() for quantifier_class in NON_AGGREGATIVE_METHODS)
    return instances
|
||||
|
||||
|
||||
@pytest.mark.parametrize('model', models_to_test_for_str_label_names())
def test_str_label_names(model):
    """
    Fits the model twice on the same imdb sample, first with the original labels and then with the labels
    renamed to 'one' (label 1) and 'zero' (any other label), and checks that the prevalence estimated for
    class 1 equals the one estimated for class 'one'.
    """
    if type(model) in {ACC, PACC, HDy}:
        # report the model as skipped; a bare return would count it as passed although nothing ran
        pytest.skip(
            f'skipping the test of binary model {type(model)} because it currently does not support random seed control.')

    def to_names(labels):
        # rename label 1 to 'one' and everything else to 'zero'
        return ['one' if label == 1 else 'zero' for label in labels]

    dataset = qp.datasets.fetch_reviews('imdb', pickle=True)
    dataset = Dataset(dataset.training.sampling(1000, *dataset.training.prevalence()),
                      dataset.test.sampling(1000, *dataset.test.prevalence()))
    qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)

    model.fit(dataset.training)

    int_estim_prevalences = model.quantify(dataset.test.instances)
    true_prevalences = dataset.test.prevalence()

    error = qp.error.mae(true_prevalences, int_estim_prevalences)
    assert isinstance(error, numpy.float64)

    dataset_str = Dataset(LabelledCollection(dataset.training.instances, to_names(dataset.training.labels)),
                          LabelledCollection(dataset.test.instances, to_names(dataset.test.labels)))

    model.fit(dataset_str.training)

    str_estim_prevalences = model.quantify(dataset_str.test.instances)
    true_prevalences = dataset_str.test.prevalence()

    error = qp.error.mae(true_prevalences, str_estim_prevalences)
    assert isinstance(error, numpy.float64)

    print(true_prevalences)
    print(int_estim_prevalences)
    print(str_estim_prevalences)

    # the position of 'one' among the string classes is looked up rather than assumed
    numpy.testing.assert_almost_equal(int_estim_prevalences[1],
                                      str_estim_prevalences[list(model.classes_).index('one')])
|
|
@ -0,0 +1,164 @@
|
|||
from setuptools import setup, find_packages
|
||||
import pathlib
|
||||
|
||||
here = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
long_description = (here / 'README.md').read_text(encoding='utf-8')
|
||||
|
||||
|
||||
def get_version(rel_path):
    """Return the version string declared in the file at `rel_path`.

    The file is scanned line by line and the first line starting with
    ``__version__`` is used. The version is the text between the first two
    occurrences of the quote character on that line (a double quote if the
    line contains one, a single quote otherwise).

    :param rel_path: path of the file to scan, relative to the module-level
        directory `here`
    :return: the version string
    :raises RuntimeError: if no line starts with ``__version__``
    """
    source = (here / rel_path).read_text(encoding='utf-8')

    version_line = next(
        (text for text in source.split('\n') if text.startswith('__version__')),
        None,
    )
    if version_line is None:
        raise RuntimeError("Unable to find version string.")

    quote = '"' if '"' in version_line else "'"
    return version_line.split(quote)[1]
|
||||
# Package metadata and build configuration for QuaPy.
# `name`, `version` and `packages` are the arguments required for uploading to
# PyPI; all the remaining ones are optional.
setup(
    # --- Identity -----------------------------------------------------------
    # Project name as registered on PyPI and used by `pip install`.
    # Naming rules: https://packaging.python.org/specifications/core-metadata/#name
    name="QuaPy",

    # Version (PEP 440), read from the package's __init__.py so that it is
    # defined in a single place.
    version=get_version("quapy/__init__.py"),

    maintainer="Alejandro Moreo",
    maintainer_email="alejandro.moreo@isti.cnr.it",

    # Project homepage ("Home-Page" metadata field) and related links.
    url="https://github.com/HLT-ISTI/QuaPy",
    project_urls={
        "Contributors": "https://github.com/HLT-ISTI/QuaPy/graphs/contributors",
        "Bug Reports": "https://github.com/HLT-ISTI/QuaPy/issues",
        "Documentation": "https://github.com/HLT-ISTI/QuaPy/wiki",
        "Source": "https://github.com/HLT-ISTI/QuaPy/",
    },

    # --- Descriptions -------------------------------------------------------
    # One-line summary ("Summary" metadata field).
    description="QuaPy: a framework for Quantification in Python",

    # Long description shown on the PyPI project page: the README contents.
    # The content type must be given explicitly because the README is
    # Markdown rather than reStructuredText.
    long_description=long_description,
    long_description_content_type="text/markdown",

    # --- Classification -----------------------------------------------------
    classifiers=[
        "Development Status :: 4 - Beta",

        "Intended Audience :: Developers",
        "Intended Audience :: Science/Research",
        "Programming Language :: Python",
        "Topic :: Software Development",
        "Topic :: Scientific/Engineering",

        "License :: OSI Approved :: BSD License",

        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3 :: Only",
    ],
    keywords="machine learning, quantification, classification, prevalence estimation, priors estimate",

    # --- Packaging ----------------------------------------------------------
    # Only the `quapy` package and its subpackages are distributed; the
    # sources live at the project root, so no `package_dir` is needed.
    packages=find_packages(include=["quapy", "quapy.*"]),

    python_requires=">=3.6, <4",

    install_requires=["scikit-learn", "pandas", "tqdm", "matplotlib"],

    # No extras, package data, data files or console entry points are declared.
)
|
Loading…
Reference in New Issue