passing pytests

Alejandro Moreo Fernandez 2024-01-29 09:43:29 +01:00
parent e6dcfbced1
commit 2f2e48d86a
6 changed files with 79 additions and 35 deletions

View File

@@ -1,6 +1,10 @@
Change Log 0.1.8
----------------
- Added Kernel Density Estimation methods (KDEyML, KDEyCS, KDEyHD) as proposed in the paper:
Moreo, A., González, P., & del Coz, J. J. Kernel Density Estimation for Multiclass Quantification.
arXiv preprint arXiv:2401.00490, 2024
- Added different solvers for ACC and PACC quantifiers. In quapy < 0.1.8 these quantifiers tried to solve the system
of equations Ax=B exactly (by means of np.linalg.solve). As noted by Mirko Bunse (thanks!), such an exact solution
does not always exist. In cases like this, quapy < 0.1.8 resorted to CC for providing a plausible solution (see the
sketch below).
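A minimal sketch of the idea (illustrative only; the function name, the least-squares fallback and the clipping/renormalization step are assumptions, not QuaPy's actual code):

import numpy as np

def solve_adjustment(A, B):
    # A: estimated misclassification-rate matrix; B: prevalence of predicted labels on the test sample
    try:
        p = np.linalg.solve(A, B)  # exact solution, when it exists
    except np.linalg.LinAlgError:
        p, *_ = np.linalg.lstsq(A, B, rcond=None)  # otherwise minimize ||Ax - B||
    p = np.clip(p, 0, None)  # prevalence values cannot be negative
    return p / p.sum()       # renormalize onto the simplex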
@@ -21,7 +25,7 @@ Change Log 0.1.8
- classification datasets
- Python API available
- New IFCB (plankton) dataset added. See fetch_IFCB.
- New IFCB (plankton) dataset added (thanks to Pablo González). See qp.datasets.fetch_IFCB.
- Added new evaluation measures NAE, NRAE (normalized absolute error and normalized relative absolute error; see the sketch below)
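For reference, a hedged sketch of NAE and NRAE as commonly defined in the quantification literature, normalizing AE and RAE by their maximum attainable value given the true prevalence (the exact smoothing and normalization used in QuaPy's error module may differ):

import numpy as np

def nae(p_true, p_hat):
    # normalized absolute error: AE divided by its maximum possible value
    ae = np.abs(p_hat - p_true).mean()
    z = 2 * (1 - p_true.min()) / len(p_true)
    return ae / z

def nrae(p_true, p_hat, eps=1e-8):
    # normalized relative absolute error (eps is an illustrative smoothing term)
    rae = (np.abs(p_hat - p_true) / (p_true + eps)).mean()
    n = len(p_true)
    z = (n - 1 + (1 - p_true.min()) / (p_true.min() + eps)) / n
    return rae / z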

View File

@@ -119,22 +119,18 @@ class MedianEstimator(BinaryQuantifier):
def _delayed_fit_classifier(self, args):
with qp.util.temp_seed(self.random_state):
print('enter job')
cls_params, training = args
model = deepcopy(self.base_quantifier)
model.set_params(**cls_params)
predictions = model.classifier_fit_predict(training, predict_on=model.val_split)
print('exit job')
return (model, predictions)
def _delayed_fit_aggregation(self, args):
with qp.util.temp_seed(self.random_state):
print('\tenter job')
((model, predictions), q_params), training = args
model = deepcopy(model)
model.set_params(**q_params)
model.aggregation_fit(predictions, training)
print('\texit job')
return model
@@ -153,7 +149,6 @@ class MedianEstimator(BinaryQuantifier):
asarray=False
)
else:
print('only 1')
model = self.base_quantifier
model.set_params(**cls_configs[0])
predictions = model.classifier_fit_predict(training, predict_on=model.val_split)
@@ -263,9 +258,10 @@ class Ensemble(BaseQuantifier):
print('[Ensemble]' + msg)
def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float] = None):
self._sout('Fit')
if self.policy == 'ds' and not data.binary:
raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary')
if val_split is None:
val_split = self.val_split
@@ -288,6 +284,7 @@ class Ensemble(BaseQuantifier):
self.ensemble = qp.util.parallel(
_delayed_new_instance,
tqdm(args, desc='fitting ensemble', total=self.size) if self.verbose else args,
asarray=False,
n_jobs=self.n_jobs)
# static selection policy (the name of a quantification-oriented error function to minimize)
@@ -369,30 +366,31 @@ class Ensemble(BaseQuantifier):
def _ds_policy_get_posteriors(self, data: LabelledCollection):
"""
In the original article, this procedure is not described in a sufficient level of detail. The paper only says
In the original article, some aspects of this procedure are left unspecified. The paper says
that the distribution of posterior probabilities from training and test examples is compared by means of the
Hellinger Distance. However, how these posterior probabilities are generated is not specified. In the article,
a Logistic Regressor (LR) is used as the classifier device and that could be used for this purpose. However, in
general, a Quantifier is not necessarily an instance of Aggregative Probabilistic Quantifiers, and so, the fact that
the quantifier builds on top of a probabilistic classifier cannot be taken for granted. Additionally, it would not
be correct to generate the posterior probabilities for training documents that have concurred in training the
be correct to generate the posterior probabilities for training instances that have taken part in training the
classifier that generates them.
This function thus generates the posterior probabilities for all training documents via cross-validation,
using a LR with hyperparameters that have previously been optimized via grid search in 5FCV.
:return P,f, where P is a ndarray containing the posterior probabilities of the training data, generated via
cross-validation and using an optimized LR, and the function to be used in order to generate posterior
probabilities for test instances.
using LR with hyperparameters that have previously been optimized via grid search in 5FCV.
:param data: a LabelledCollection
:return: (P,f) where P is an ndarray containing the posterior probabilities of the training data, generated via
cross-validation and using an optimized LR, and the function to be used in order to generate posterior
probabilities for test instances.
"""
X, y = data.Xy
lr_base = LogisticRegression(class_weight='balanced', max_iter=1000)
optim = GridSearchCV(
lr_base, param_grid={'C': np.logspace(-4, 4, 9)}, cv=5, n_jobs=self.n_jobs, refit=True
).fit(X, y)
param_grid = {'C': np.logspace(-4, 4, 9)}
optim = GridSearchCV(lr_base, param_grid=param_grid, cv=5, n_jobs=self.n_jobs, refit=True).fit(X, y)
posteriors = cross_val_predict(
optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba'
)
posteriors = cross_val_predict(optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba')
posteriors_generator = optim.best_estimator_.predict_proba
return posteriors, posteriors_generator
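For context, the ds policy then compares the distribution of these training posteriors against that of the test-sample posteriors via the Hellinger Distance (the get_probability_distribution helper used below plays this role). A minimal, hypothetical sketch of that comparison, assuming binned histograms of the positive-class posterior:

import numpy as np

def posterior_histogram(posteriors, bins=8):
    # discrete distribution of the positive-class posterior (the bin count is an assumption)
    hist, _ = np.histogram(posteriors[:, 1], bins=bins, range=(0, 1))
    return hist / hist.sum()

def hellinger(p, q):
    # Hellinger distance between two discrete distributions
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

# e.g.: hellinger(posterior_histogram(train_posteriors), posterior_histogram(test_posteriors))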
@@ -463,8 +461,10 @@ def _delayed_new_instance(args):
tr_prevalence = sample.prevalence()
tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None
if verbose:
print(f'\t\\--fit-ended for prev {F.strprev(prev)}')
return (model, tr_prevalence, tr_distribution, sample if keep_samples else None)
@@ -475,8 +475,9 @@ def _delayed_quantify(args):
def _draw_simplex(ndim, min_val, max_trials=100):
"""
returns a uniform sampling from the ndim-dimensional simplex but guarantees that all dimensions
Returns a uniform sampling from the ndim-dimensional simplex but guarantees that all dimensions
are >= min_val (for min_val>0, this makes the sampling not truly uniform)
:param ndim: number of dimensions of the simplex
:param min_val: minimum class prevalence allowed; it must be smaller than 1/ndim, otherwise a ValueError will be
thrown since there is no possible solution.
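The function body is not shown in this hunk; a rough sketch of one way to obtain the documented behaviour (rejection sampling over a uniform draw from the simplex; illustrative, not necessarily QuaPy's implementation):

import numpy as np

def draw_simplex(ndim, min_val, max_trials=100):
    # no point of the simplex can have all ndim values >= min_val when min_val >= 1/ndim
    if min_val >= 1 / ndim:
        raise ValueError(f'no point of the {ndim}-simplex has all values >= {min_val}')
    for _ in range(max_trials):
        # uniform draw from the simplex: consecutive differences of sorted uniforms
        cuts = np.sort(np.random.rand(ndim - 1))
        prev = np.diff(np.concatenate(([0.], cuts, [1.])))
        if prev.min() >= min_val:
            return prev
    raise ValueError(f'no valid sample found after {max_trials} trials')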

View File

@@ -1,8 +1,8 @@
import pytest
from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, LEQUA2022_TASKS, \
fetch_reviews, fetch_twitter, fetch_UCIDataset, fetch_lequa2022
TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, LEQUA2022_TASKS, UCI_MULTICLASS_DATASETS,\
fetch_reviews, fetch_twitter, fetch_UCIDataset, fetch_lequa2022, fetch_UCIMulticlassLabelledCollection
@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
@@ -44,6 +44,15 @@ def test_fetch_UCIDataset(dataset_name):
print('Test set stats')
@pytest.mark.parametrize('dataset_name', UCI_MULTICLASS_DATASETS)
def test_fetch_UCIMultiDataset(dataset_name):
dataset = fetch_UCIMulticlassLabelledCollection(dataset_name)
print(f'Dataset {dataset_name}')
print('Training set stats')
dataset.stats()
print('Test set stats')
@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS)
def test_fetch_lequa2022(dataset_name):
train, gen_val, gen_test = fetch_lequa2022(dataset_name)

View File

@@ -1,12 +1,8 @@
import unittest
from sklearn.linear_model import LogisticRegression
import quapy as qp
from quapy.method.aggregative import *
class HierarchyTestCase(unittest.TestCase):
def test_aggregative(self):
@@ -22,8 +18,10 @@ class HierarchyTestCase(unittest.TestCase):
def test_probabilistic(self):
lr = LogisticRegression()
for m in [CC(lr), ACC(lr)]:
self.assertEqual(isinstance(m, AggregativeCrispQuantifier), True)
self.assertEqual(isinstance(m, AggregativeSoftQuantifier), False)
for m in [PCC(lr), PACC(lr)]:
self.assertEqual(isinstance(m, AggregativeCrispQuantifier), False)
self.assertEqual(isinstance(m, AggregativeSoftQuantifier), True)

View File

@@ -67,15 +67,16 @@ def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
@pytest.mark.parametrize('dataset', tinydatasets)
@pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES)
def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
qp.environ['SAMPLE_SIZE'] = 20
base_quantifier=base_method(learner())
if isinstance(base_quantifier, BinaryQuantifier) and not dataset.binary:
print(f'skipping the test of binary model {base_quantifier} on non-binary dataset {dataset}')
return
if not dataset.binary and policy=='ds':
print(f'skipping the test of the binary-only policy "ds" on non-binary dataset {dataset}')
return
model = Ensemble(quantifier=base_quantifier, size=5, policy=policy, n_jobs=-1)
model = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1)
model.fit(dataset.training)
@@ -97,9 +98,7 @@ def test_quanet_method():
qp.environ['SAMPLE_SIZE'] = 100
# load the kindle dataset as text, and convert words to numerical indexes
dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
dataset = Dataset(dataset.training.sampling(200, *dataset.training.prevalence()),
dataset.test.sampling(200, *dataset.test.prevalence()))
dataset = qp.datasets.fetch_reviews('kindle', pickle=True).reduce(200, 200)
qp.data.preprocessing.index(dataset, min_df=5, inplace=True)
from quapy.classification.neural import CNNnet

View File

@@ -3,11 +3,13 @@ import quapy as qp
from quapy.data import LabelledCollection
from quapy.functional import strprev
from sklearn.linear_model import LogisticRegression
import numpy as np
from quapy.method.aggregative import PACC
import quapy.functional as F
class MyTestCase(unittest.TestCase):
def test_prediction_replicability(self):
dataset = qp.datasets.fetch_UCIDataset('yeast')
@@ -26,8 +28,8 @@ class MyTestCase(unittest.TestCase):
self.assertEqual(str_prev1, str_prev2)  # the two predicted prevalences must coincide
def test_sampling_replicability(self):
import numpy as np
def equal_collections(c1, c2, value=True):
self.assertEqual(np.all(c1.X == c2.X), value)
@@ -74,5 +76,36 @@ class MyTestCase(unittest.TestCase):
equal_collections(sample1_te, sample2_te, True)
def test_parallel_replicability(self):
train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean').train_test
test = test.sampling(500, *[0.1, 0.0, 0.1, 0.1, 0.2, 0.5, 0.0])
with qp.util.temp_seed(10):
pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2)
pacc.fit(train, val_split=0.5)
prev1 = F.strprev(pacc.quantify(test.instances))
with qp.util.temp_seed(0):
pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2)
pacc.fit(train, val_split=0.5)
prev2 = F.strprev(pacc.quantify(test.instances))
with qp.util.temp_seed(0):
pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2)
pacc.fit(train, val_split=0.5)
prev3 = F.strprev(pacc.quantify(test.instances))
print(prev1)
print(prev2)
print(prev3)
self.assertNotEqual(prev1, prev2)
self.assertEqual(prev2, prev3)
if __name__ == '__main__':
unittest.main()