Merge pull request #2 from HLT-ISTI/packaging

pip package
commit 37defb9291
Andrea Esuli, 2021-05-10 13:37:48 +02:00, committed by GitHub
5 changed files with 204 additions and 30 deletions

View File

@@ -2,7 +2,6 @@ Packaging:
 ==========================================
 Documentation with sphinx
 Document methods with paper references
-allow for "pip install"
 unit-tests
 
 New features:

View File

@@ -10,7 +10,7 @@ from . import model_selection
 from . import classification
 from quapy.method.base import isprobabilistic, isaggregative
 
-__version__ = '0.1'
+__version__ = '0.1.4'
 
 environ = {
     'SAMPLE_SIZE': None,

View File

@@ -1,28 +1,32 @@
 from copy import deepcopy
 from typing import Union
-import numpy as np
-from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import f1_score, make_scorer, accuracy_score
-from sklearn.model_selection import GridSearchCV, cross_val_predict
 from tqdm import tqdm
+
+import numpy as np
+from joblib import Parallel, delayed
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV, cross_val_predict
+
 import quapy as qp
-from quapy.data import LabelledCollection
 from quapy import functional as F
+from quapy.data import LabelledCollection
 from quapy.evaluation import evaluate
 from quapy.model_selection import GridSearchQ
-from . import neural
-from .base import BaseQuantifier
-from quapy.method.aggregative import CC, ACC, PCC, PACC, HDy, EMQ
-
-QuaNet = neural.QuaNetTrainer
+try:
+    from . import neural
+except ModuleNotFoundError:
+    neural = None
+from .base import BaseQuantifier
+from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ
+
+if neural:
+    QuaNet = neural.QuaNetTrainer
+else:
+    QuaNet = "QuaNet is not available due to missing torch package"
 
 
 class Ensemble(BaseQuantifier):
     VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES
 
     """
@@ -65,9 +69,9 @@ class Ensemble(BaseQuantifier):
         if self.verbose:
             print('[Ensemble]' + msg)
 
-    def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float]=None):
+    def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float] = None):
         self.sout('Fit')
-        if self.policy=='ds' and not data.binary:
+        if self.policy == 'ds' and not data.binary:
             raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary')
         if val_split is None:
             val_split = self.val_split
@@ -132,7 +136,7 @@ class Ensemble(BaseQuantifier):
         tests = [m[3] for m in self.ensemble]
         scores = []
         for i, model in enumerate(self.ensemble):
-            scores.append(evaluate(model[0], tests[:i] + tests[i+1:], error, self.n_jobs))
+            scores.append(evaluate(model[0], tests[:i] + tests[i + 1:], error, self.n_jobs))
         order = np.argsort(scores)
 
         self.ensemble = _select_k(self.ensemble, order, k=self.red_size)
@@ -168,7 +172,7 @@ class Ensemble(BaseQuantifier):
         lr_base = LogisticRegression(class_weight='balanced', max_iter=1000)
 
         optim = GridSearchCV(
-            lr_base, param_grid={'C': np.logspace(-4,4,9)}, cv=5, n_jobs=self.n_jobs, refit=True
+            lr_base, param_grid={'C': np.logspace(-4, 4, 9)}, cv=5, n_jobs=self.n_jobs, refit=True
         ).fit(X, y)
 
         posteriors = cross_val_predict(
@@ -204,8 +208,8 @@ class Ensemble(BaseQuantifier):
 
 def get_probability_distribution(posterior_probabilities, bins=8):
-    assert posterior_probabilities.shape[1]==2, 'the posterior probabilities do not seem to be for a binary problem'
-    posterior_probabilities = posterior_probabilities[:,1]  # take the positive posteriors only
+    assert posterior_probabilities.shape[1] == 2, 'the posterior probabilities do not seem to be for a binary problem'
+    posterior_probabilities = posterior_probabilities[:, 1]  # take the positive posteriors only
     distribution, _ = np.histogram(posterior_probabilities, bins=bins, range=(0, 1), density=True)
     return distribution
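
Note: get_probability_distribution keeps only the positive-class posteriors and summarizes them as a bins-wide histogram over [0, 1]. A quick worked example of the histogram step (input values are illustrative):

    import numpy as np

    posteriors = np.array([[0.9, 0.1], [0.2, 0.8], [0.3, 0.7], [0.5, 0.5]])
    dist, _ = np.histogram(posteriors[:, 1], bins=8, range=(0, 1), density=True)
    # density=True rescales counts so the histogram integrates to 1 over [0, 1]
    print(dist)  # -> [2. 0. 0. 0. 2. 2. 2. 0.]
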
@@ -223,7 +227,7 @@ def _delayed_new_instance(args):
     if val_split is not None:
         if isinstance(val_split, float):
             assert 0 < val_split < 1, 'val_split should be in (0,1)'
-            data, val_split = data.split_stratified(train_prop=1-val_split)
+            data, val_split = data.split_stratified(train_prop=1 - val_split)
 
     sample_index = data.sampling_index(sample_size, *prev)
     sample = data.sampling_from_index(sample_index)
@@ -255,7 +259,7 @@ def _draw_simplex(ndim, min_val, max_trials=100):
     :return: a sample from the ndim-dimensional simplex that is uniform in S(ndim)-R where S(ndim) is the simplex
     and R is the simplex subset containing dimensions lower than min_val
     """
-    if min_val >= 1/ndim:
+    if min_val >= 1 / ndim:
         raise ValueError(f'no sample can be draw from the {ndim}-dimensional simplex so that '
                          f'all its values are >={min_val} (try with a larger value for min_pos)')
     trials = 0
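
Note: the guard is sound because any point on the ndim-dimensional simplex with all coordinates >= min_val sums to at least ndim * min_val, so min_val >= 1/ndim leaves no feasible sample (barring the exact barycenter at equality). A sketch of the rejection-sampling idea the guard protects, assuming uniform draws with retries (function name and body are our illustration, not QuaPy's exact code):

    import numpy as np

    def draw_simplex_sketch(ndim, min_val, max_trials=100):
        if min_val >= 1 / ndim:
            raise ValueError('infeasible: ndim * min_val >= 1')
        for _ in range(max_trials):
            # spacings of sorted uniforms are uniformly distributed on the simplex
            cuts = np.sort(np.random.rand(ndim - 1))
            sample = np.diff(np.concatenate(([0.0], cuts, [1.0])))
            if (sample >= min_val).all():
                return sample
        raise ValueError('max_trials reached without a valid sample')
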
@@ -300,14 +304,15 @@ def _check_error(error):
                          f'the name of an error function in {qp.error.ERROR_NAMES}')
 
 
-def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, param_model_sel:dict=None, **kwargs):
+def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, param_model_sel: dict = None,
+                    **kwargs):
     if optim is not None:
         if param_grid is None:
             raise ValueError(f'param_grid is None but optim was requested.')
         if param_model_sel is None:
             raise ValueError(f'param_model_sel is None but optim was requested.')
     error = _check_error(optim)
     return _instantiate_ensemble(learner, base_quantifier_class, param_grid, error, param_model_sel, **kwargs)
 
 
 def ECC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
@@ -327,4 +332,4 @@ def EHDy(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
 
 def EEMQ(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
     return ensembleFactory(learner, EMQ, param_grid, optim, param_mod_sel, **kwargs)
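
Note: ensembleFactory validates that a param_grid and param_model_sel accompany any optim request, resolves the error metric, and defers to _instantiate_ensemble; ECC, EHDy, EEMQ and the other E* helpers just fix the base quantifier class. A hypothetical call (the grid and model-selection settings below are illustrative, not taken from the diff):

    from sklearn.linear_model import LogisticRegression
    from quapy.method.meta import ECC

    # ensemble of CC quantifiers, optimizing mean absolute error; extra kwargs
    # are forwarded to the Ensemble constructor
    ensemble = ECC(LogisticRegression(), optim='mae',
                   param_grid={'C': [0.1, 1, 10]},
                   param_mod_sel={'sample_size': 100, 'eval_budget': 100})
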

View File

@@ -100,6 +100,12 @@ def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
 
 def test_quanet_method():
+    try:
+        import quapy.classification.neural
+    except ModuleNotFoundError:
+        print('skipping QuaNet test due to missing torch package')
+        return
+
     dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
     dataset = Dataset(dataset.training.sampling(100, *dataset.training.prevalence()),
                       dataset.test.sampling(100, *dataset.test.prevalence()))
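
Note: the early return makes the QuaNet test pass silently when torch is absent. An equivalent and arguably more idiomatic pytest pattern, offered here as a suggestion rather than what the diff uses, marks the test as skipped instead:

    import pytest

    def test_quanet_method():
        pytest.importorskip('quapy.classification.neural')  # reported as skipped, not passed
        ...
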

setup.py (new file, 164 lines)
View File

@@ -0,0 +1,164 @@
+from setuptools import setup, find_packages
+import pathlib
+
+here = pathlib.Path(__file__).parent.resolve()
+
+long_description = (here / 'README.md').read_text(encoding='utf-8')
+
+
+def get_version(rel_path):
+    init_content = (here / rel_path).read_text(encoding='utf-8')
+    for line in init_content.split('\n'):
+        if line.startswith('__version__'):
+            delim = '"' if '"' in line else "'"
+            return line.split(delim)[1]
+    else:
+        raise RuntimeError("Unable to find version string.")
+
+
+# Arguments marked as "Required" below must be included for upload to PyPI.
+# Fields marked as "Optional" may be commented out.
+setup(
+    # This is the name of your project. The first time you publish this
+    # package, this name will be registered for you. It will determine how
+    # users can install this project, e.g.:
+    #
+    #     $ pip install sampleproject
+    #
+    # And where it will live on PyPI: https://pypi.org/project/sampleproject/
+    #
+    # There are some restrictions on what makes a valid project name
+    # specification here:
+    # https://packaging.python.org/specifications/core-metadata/#name
+    name='QuaPy',  # Required
+
+    # Versions should comply with PEP 440:
+    # https://www.python.org/dev/peps/pep-0440/
+    #
+    # For a discussion on single-sourcing the version across setup.py and the
+    # project code, see
+    # https://packaging.python.org/en/latest/single_source_version.html
+    version=get_version("quapy/__init__.py"),  # Required
+
+    # This is a one-line description or tagline of what your project does. This
+    # corresponds to the "Summary" metadata field:
+    # https://packaging.python.org/specifications/core-metadata/#summary
+    description='QuaPy: a framework for Quantification in Python',  # Optional
+
+    # This is an optional longer description of your project that represents
+    # the body of text which users will see when they visit PyPI.
+    #
+    # Often, this is the same as your README, so you can just read it in from
+    # that file directly (as we have already done above)
+    #
+    # This field corresponds to the "Description" metadata field:
+    # https://packaging.python.org/specifications/core-metadata/#description-optional
+    long_description=long_description,  # Optional
+
+    # Denotes that our long_description is in Markdown; valid values are
+    # text/plain, text/x-rst, and text/markdown
+    #
+    # Optional if long_description is written in reStructuredText (rst) but
+    # required for plain-text or Markdown; if unspecified, "applications should
+    # attempt to render [the long_description] as text/x-rst; charset=UTF-8 and
+    # fall back to text/plain if it is not valid rst" (see link below)
+    #
+    # This field corresponds to the "Description-Content-Type" metadata field:
+    # https://packaging.python.org/specifications/core-metadata/#description-content-type-optional
+    long_description_content_type='text/markdown',  # Optional (see note above)
+
+    # This should be a valid link to your project's main homepage.
+    #
+    # This field corresponds to the "Home-Page" metadata field:
+    # https://packaging.python.org/specifications/core-metadata/#home-page-optional
+    url='https://github.com/HLT-ISTI/QuaPy',  # Optional
+
+    maintainer='Alejandro Moreo',
+    maintainer_email='alejandro.moreo@isti.cnr.it',
+
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'Intended Audience :: Developers',
+        'Intended Audience :: Science/Research',
+        'Programming Language :: Python',
+        'Topic :: Software Development',
+        'Topic :: Scientific/Engineering',
+        'License :: OSI Approved :: BSD License',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3 :: Only',
+    ],
+
+    keywords='machine learning, quantification, classification, prevalence estimation, priors estimate',
+
+    # When your source code is in a subdirectory under the project root, e.g.
+    # `src/`, it is necessary to specify the `package_dir` argument.
+    # package_dir={'': 'src'},  # Optional
+
+    # You can just specify package directories manually here if your project is
+    # simple. Or you can use find_packages().
+    #
+    # Alternatively, if you just want to distribute a single Python file, use
+    # the `py_modules` argument instead as follows, which will expect a file
+    # called `my_module.py` to exist:
+    #
+    #     py_modules=["my_module"],
+    #
+    packages=find_packages(include=['quapy', 'quapy.*']),  # Required
+
+    python_requires='>=3.6, <4',
+
+    install_requires=['scikit-learn', 'pandas', 'tqdm', 'matplotlib'],
+
+    # List additional groups of dependencies here (e.g. development
+    # dependencies). Users will be able to install these using the "extras"
+    # syntax, for example:
+    #
+    #     $ pip install sampleproject[dev]
+    #
+    # Similar to `install_requires` above, these must be valid existing
+    # projects.
+    # extras_require={  # Optional
+    #     'dev': ['check-manifest'],
+    #     'test': ['coverage'],
+    # },
+
+    # If there are data files included in your packages that need to be
+    # installed, specify them here.
+    # package_data={  # Optional
+    #     'sample': ['package_data.dat'],
+    # },
+
+    # Although 'package_data' is the preferred approach, in some case you may
+    # need to place data files outside of your packages. See:
+    # http://docs.python.org/distutils/setupscript.html#installing-additional-files
+    #
+    # In this case, 'data_file' will be installed into '<sys.prefix>/my_data'
+    # data_files=[('my_data', ['data/data_file'])],  # Optional
+
+    # To provide executable scripts, use entry points in preference to the
+    # "scripts" keyword. Entry points provide cross-platform support and allow
+    # `pip` to create the appropriate form of executable for the target
+    # platform.
+    #
+    # For example, the following would provide a command called `sample` which
+    # executes the function `main` from this package when invoked:
+    # entry_points={  # Optional
+    #     'console_scripts': [
+    #         'sample=sample:main',
+    #     ],
+    # },
+
+    project_urls={  # Optional
+        'Contributors': 'https://github.com/HLT-ISTI/QuaPy/graphs/contributors',
+        'Bug Reports': 'https://github.com/HLT-ISTI/QuaPy/issues',
+        'Documentation': 'https://github.com/HLT-ISTI/QuaPy/wiki',
+        'Source': 'https://github.com/HLT-ISTI/QuaPy/',
+    },
+)
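
Note: version=get_version("quapy/__init__.py") single-sources the version string: setup.py parses __version__ out of the package rather than duplicating it, so the bump to '0.1.4' earlier in this commit flows straight into the PyPI metadata. Tracing the parsing step on the line this commit introduces:

    line = "__version__ = '0.1.4'"
    delim = '"' if '"' in line else "'"  # pick whichever quote style the line uses
    print(line.split(delim)[1])          # -> 0.1.4
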