
passing pytests

Alejandro Moreo Fernandez 2024-01-29 09:43:29 +01:00
parent e6dcfbced1
commit 2f2e48d86a
6 changed files with 79 additions and 35 deletions

View File

@@ -1,6 +1,10 @@
 Change Log 0.1.8
 ----------------
+- Added Kernel Density Estimation methods (KDEyML, KDEyCS, KDEyHD) as proposed in the paper:
+  Moreo, A., González, P., & del Coz, J. J. Kernel Density Estimation for Multiclass Quantification.
+  arXiv preprint arXiv:2401.00490, 2024
 - Added different solvers for ACC and PACC quantifiers. In quapy < 0.1.8 these quantifiers try to solve the system
   of equations Ax=B exactly (by means of np.linalg.solve). As noted by Mirko Bunse (thanks!), such an exact solution
   does sometimes not exist. In cases like this, quapy < 0.1.8 resorted to CC for providing a plausible solution.
@@ -21,7 +25,7 @@ Change Log 0.1.8
 - classification datasets
 - Python API available
-- New IFCB (plankton) dataset added. See fetch_IFCB.
+- New IFCB (plankton) dataset added (thanks to Pablo González). See qp.datasets.fetch_IFCB.
 - Added new evaluation measures NAE, NRAE
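Regarding the ACC/PACC solver entry in the changelog above: a minimal sketch, not QuaPy's implementation, of why the exact solution of the adjustment system Ax=b can fail and what a least-squares fallback looks like (the function name and the clip-and-renormalize step are illustrative assumptions):

import numpy as np

def adjust_prevalence(A, b):
    # A: estimated misclassification-rate matrix; b: prevalence observed by the classifier (the CC estimate).
    # The exact solution of A x = b may not exist (singular A) or may fall outside the probability simplex,
    # so fall back to a least-squares solution and project the result back onto the simplex.
    try:
        x = np.linalg.solve(A, b)
    except np.linalg.LinAlgError:
        x, *_ = np.linalg.lstsq(A, b, rcond=None)
    x = np.clip(x, 0, None)
    return x / x.sum() if x.sum() > 0 else np.full_like(x, 1 / len(x))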

View File

@@ -119,22 +119,18 @@ class MedianEstimator(BinaryQuantifier):
     def _delayed_fit_classifier(self, args):
         with qp.util.temp_seed(self.random_state):
-            print('enter job')
             cls_params, training = args
             model = deepcopy(self.base_quantifier)
             model.set_params(**cls_params)
             predictions = model.classifier_fit_predict(training, predict_on=model.val_split)
-            print('exit job')
             return (model, predictions)

     def _delayed_fit_aggregation(self, args):
         with qp.util.temp_seed(self.random_state):
-            print('\tenter job')
             ((model, predictions), q_params), training = args
             model = deepcopy(model)
             model.set_params(**q_params)
             model.aggregation_fit(predictions, training)
-            print('\texit job')
             return model

@@ -153,7 +149,6 @@ class MedianEstimator(BinaryQuantifier):
                 asarray=False
             )
         else:
-            print('only 1')
             model = self.base_quantifier
             model.set_params(**cls_configs[0])
             predictions = model.classifier_fit_predict(training, predict_on=model.val_split)
@@ -263,9 +258,10 @@ class Ensemble(BaseQuantifier):
             print('[Ensemble]' + msg)

     def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float] = None):
+        self._sout('Fit')
         if self.policy == 'ds' and not data.binary:
             raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary')

         if val_split is None:
             val_split = self.val_split

@@ -288,6 +284,7 @@ class Ensemble(BaseQuantifier):
         self.ensemble = qp.util.parallel(
             _delayed_new_instance,
             tqdm(args, desc='fitting ensamble', total=self.size) if self.verbose else args,
+            asarray=False,
             n_jobs=self.n_jobs)

         # static selection policy (the name of a quantification-oriented error function to minimize)
@@ -369,30 +366,31 @@ class Ensemble(BaseQuantifier):
     def _ds_policy_get_posteriors(self, data: LabelledCollection):
         """
-        In the original article, this procedure is not described in a sufficient level of detail. The paper only says
+        In the original article, there are some aspects regarding this method that are not mentioned. The paper says
         that the distribution of posterior probabilities from training and test examples is compared by means of the
         Hellinger Distance. However, how these posterior probabilities are generated is not specified. In the article,
         a Logistic Regressor (LR) is used as the classifier device and that could be used for this purpose. However, in
         general, a Quantifier is not necessarily an instance of Aggreggative Probabilistic Quantifiers, and so, that the
         quantifier builds on top of a probabilistic classifier cannot be given for granted. Additionally, it would not
-        be correct to generate the posterior probabilities for training documents that have concurred in training the
+        be correct to generate the posterior probabilities for training instances that have concurred in training the
         classifier that generates them.
         This function thus generates the posterior probabilities for all training documents in a cross-validation way,
-        using a LR with hyperparameters that have previously been optimized via grid search in 5FCV.
-        :return P,f, where P is a ndarray containing the posterior probabilities of the training data, generated via
-        cross-validation and using an optimized LR, and the function to be used in order to generate posterior
-        probabilities for test instances.
+        using LR with hyperparameters that have previously been optimized via grid search in 5FCV.
+
+        :param data: a LabelledCollection
+        :return: (P,f,) where P is an ndarray containing the posterior probabilities of the training data, generated via
+            cross-validation and using an optimized LR, and the function to be used in order to generate posterior
+            probabilities for test instances.
         """
         X, y = data.Xy
         lr_base = LogisticRegression(class_weight='balanced', max_iter=1000)
-        optim = GridSearchCV(
-            lr_base, param_grid={'C': np.logspace(-4, 4, 9)}, cv=5, n_jobs=self.n_jobs, refit=True
-        ).fit(X, y)
-        posteriors = cross_val_predict(
-            optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba'
-        )
+
+        param_grid = {'C': np.logspace(-4, 4, 9)}
+        optim = GridSearchCV(lr_base, param_grid=param_grid, cv=5, n_jobs=self.n_jobs, refit=True).fit(X, y)
+
+        posteriors = cross_val_predict(optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba')
+
         posteriors_generator = optim.best_estimator_.predict_proba

         return posteriors, posteriors_generator
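For context, a minimal sketch of the comparison step the docstring above alludes to (training vs. test posterior distributions compared via the Hellinger Distance). The bin count and the use of the positive-class column are illustrative assumptions, not necessarily what QuaPy's get_probability_distribution does:

import numpy as np

def hellinger(p, q):
    # Hellinger distance between two discrete distributions p and q (each summing to 1)
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

def posterior_histogram(posteriors, bins=8):
    # normalized histogram of the positive-class posterior probabilities (binary case)
    hist, _ = np.histogram(posteriors[:, 1], bins=bins, range=(0, 1))
    return hist / hist.sum()

# e.g., distance between the training posteriors P returned by the function above and the
# posteriors produced for a test sample by posteriors_generator:
# hellinger(posterior_histogram(P), posterior_histogram(posteriors_generator(X_test)))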
@@ -463,8 +461,10 @@ def _delayed_new_instance(args):
     tr_prevalence = sample.prevalence()
     tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None

     if verbose:
         print(f'\t\--fit-ended for prev {F.strprev(prev)}')

     return (model, tr_prevalence, tr_distribution, sample if keep_samples else None)

@@ -475,8 +475,9 @@ def _delayed_quantify(args):
 def _draw_simplex(ndim, min_val, max_trials=100):
     """
-    returns a uniform sampling from the ndim-dimensional simplex but guarantees that all dimensions
+    Returns a uniform sampling from the ndim-dimensional simplex but guarantees that all dimensions
     are >= min_class_prev (for min_val>0, this makes the sampling not truly uniform)

     :param ndim: number of dimensions of the simplex
     :param min_val: minimum class prevalence allowed. If less than 1/ndim a ValueError will be throw since
         there is no possible solution.
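The body of _draw_simplex is not part of this hunk; the following is a minimal standalone sketch (not QuaPy's code) of the behaviour the docstring describes: draw uniformly from the simplex and reject until every coordinate is at least min_val, giving up after max_trials.

import numpy as np

def draw_simplex_sketch(ndim, min_val, max_trials=100):
    # coordinates sum to 1, so no valid point exists once min_val exceeds 1/ndim
    if min_val >= 1 / ndim:
        raise ValueError(f'all coordinates cannot be >= {min_val} in a {ndim}-dimensional simplex')
    for _ in range(max_trials):
        # the gaps between sorted uniform cut points are a uniform draw from the simplex
        cuts = np.sort(np.random.rand(ndim - 1))
        prev = np.diff(np.concatenate(([0.0], cuts, [1.0])))
        if prev.min() >= min_val:
            return prev
    raise ValueError(f'no valid sample found after {max_trials} trials')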

View File

@@ -1,8 +1,8 @@
 import pytest
 from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
-    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, LEQUA2022_TASKS, \
-    fetch_reviews, fetch_twitter, fetch_UCIDataset, fetch_lequa2022
+    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, LEQUA2022_TASKS, UCI_MULTICLASS_DATASETS,\
+    fetch_reviews, fetch_twitter, fetch_UCIDataset, fetch_lequa2022, fetch_UCIMulticlassLabelledCollection

 @pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)

@@ -44,6 +44,15 @@ def test_fetch_UCIDataset(dataset_name):
     print('Test set stats')

+@pytest.mark.parametrize('dataset_name', UCI_MULTICLASS_DATASETS)
+def test_fetch_UCIMultiDataset(dataset_name):
+    dataset = fetch_UCIMulticlassLabelledCollection(dataset_name)
+    print(f'Dataset {dataset_name}')
+    print('Training set stats')
+    dataset.stats()
+    print('Test set stats')
+
 @pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS)
 def test_fetch_lequa2022(dataset_name):
     train, gen_val, gen_test = fetch_lequa2022(dataset_name)

View File

@@ -1,12 +1,8 @@
 import unittest
 from sklearn.linear_model import LogisticRegression
-import quapy as qp
 from quapy.method.aggregative import *

 class HierarchyTestCase(unittest.TestCase):

     def test_aggregative(self):

@@ -22,8 +18,10 @@ class HierarchyTestCase(unittest.TestCase):
     def test_probabilistic(self):
         lr = LogisticRegression()
         for m in [CC(lr), ACC(lr)]:
+            self.assertEqual(isinstance(m, AggregativeCrispQuantifier), True)
             self.assertEqual(isinstance(m, AggregativeSoftQuantifier), False)
         for m in [PCC(lr), PACC(lr)]:
+            self.assertEqual(isinstance(m, AggregativeCrispQuantifier), False)
             self.assertEqual(isinstance(m, AggregativeSoftQuantifier), True)

View File

@@ -67,15 +67,16 @@ def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
 @pytest.mark.parametrize('dataset', tinydatasets)
 @pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES)
 def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
     qp.environ['SAMPLE_SIZE'] = 20
     base_quantifier=base_method(learner())
+    if isinstance(base_quantifier, BinaryQuantifier) and not dataset.binary:
+        print(f'skipping the test of binary model {base_quantifier} on non-binary dataset {dataset}')
+        return
     if not dataset.binary and policy=='ds':
         print(f'skipping the test of binary policy ds on non-binary dataset {dataset}')
         return
-    model = Ensemble(quantifier=base_quantifier, size=5, policy=policy, n_jobs=-1)
+    model = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1)
     model.fit(dataset.training)
@@ -97,9 +98,7 @@ def test_quanet_method():
     qp.environ['SAMPLE_SIZE'] = 100

     # load the kindle dataset as text, and convert words to numerical indexes
-    dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
-    dataset = Dataset(dataset.training.sampling(200, *dataset.training.prevalence()),
-                      dataset.test.sampling(200, *dataset.test.prevalence()))
+    dataset = qp.datasets.fetch_reviews('kindle', pickle=True).reduce(200, 200)
     qp.data.preprocessing.index(dataset, min_df=5, inplace=True)

     from quapy.classification.neural import CNNnet

View File

@@ -3,11 +3,13 @@ import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.functional import strprev
 from sklearn.linear_model import LogisticRegression
+import numpy as np
 from quapy.method.aggregative import PACC
+import quapy.functional as F

 class MyTestCase(unittest.TestCase):

     def test_prediction_replicability(self):

         dataset = qp.datasets.fetch_UCIDataset('yeast')

@@ -26,8 +28,8 @@ class MyTestCase(unittest.TestCase):
         self.assertEqual(str_prev1, str_prev2)  # add assertion here

     def test_samping_replicability(self):
-        import numpy as np

         def equal_collections(c1, c2, value=True):
             self.assertEqual(np.all(c1.X == c2.X), value)
@@ -74,5 +76,36 @@ class MyTestCase(unittest.TestCase):
         equal_collections(sample1_te, sample2_te, True)

+    def test_parallel_replicability(self):
+
+        train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean').train_test
+
+        test = test.sampling(500, *[0.1, 0.0, 0.1, 0.1, 0.2, 0.5, 0.0])
+
+        with qp.util.temp_seed(10):
+            pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2)
+            pacc.fit(train, val_split=0.5)
+            prev1 = F.strprev(pacc.quantify(test.instances))
+
+        with qp.util.temp_seed(0):
+            pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2)
+            pacc.fit(train, val_split=0.5)
+            prev2 = F.strprev(pacc.quantify(test.instances))
+
+        with qp.util.temp_seed(0):
+            pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2)
+            pacc.fit(train, val_split=0.5)
+            prev3 = F.strprev(pacc.quantify(test.instances))
+
+        print(prev1)
+        print(prev2)
+        print(prev3)
+
+        self.assertNotEqual(prev1, prev2)
+        self.assertEqual(prev2, prev3)
+

 if __name__ == '__main__':
     unittest.main()