commit 37defb9291

diff --git a/TODO.txt b/TODO.txt
--- a/TODO.txt
+++ b/TODO.txt
@@ -2,7 +2,6 @@ Packaging:
 ==========================================
 Documentation with sphinx
 Document methods with paper references
-allow for "pip install"
 unit-tests
 
 New features:
diff --git a/quapy/__init__.py b/quapy/__init__.py
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -10,7 +10,7 @@ from . import model_selection
 from . import classification
 from quapy.method.base import isprobabilistic, isaggregative
 
-__version__ = '0.1'
+__version__ = '0.1.4'
 
 environ = {
     'SAMPLE_SIZE': None,
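Note: this is the version bump that the new setup.py below single-sources via get_version(). After this change the version is also introspectable at runtime:

    import quapy
    print(quapy.__version__)  # '0.1.4'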
diff --git a/quapy/method/meta.py b/quapy/method/meta.py
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@@ -1,28 +1,32 @@
 from copy import deepcopy
 from typing import Union
 
+import numpy as np
+from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import f1_score, make_scorer, accuracy_score
+from sklearn.model_selection import GridSearchCV, cross_val_predict
 from tqdm import tqdm
 
-import numpy as np
-from joblib import Parallel, delayed
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import GridSearchCV, cross_val_predict
-
 import quapy as qp
-from quapy.data import LabelledCollection
 from quapy import functional as F
+from quapy.data import LabelledCollection
 from quapy.evaluation import evaluate
 from quapy.model_selection import GridSearchQ
-from . import neural
-from .base import BaseQuantifier
-from quapy.method.aggregative import CC, ACC, PCC, PACC, HDy, EMQ
+try:
+    from . import neural
+except ModuleNotFoundError:
+    neural = None
+from .base import BaseQuantifier
+from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ
 
-QuaNet = neural.QuaNetTrainer
+if neural:
+    QuaNet = neural.QuaNetTrainer
+else:
+    QuaNet = "QuaNet is not available due to missing torch package"
 
 
 class Ensemble(BaseQuantifier):
 
     VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES
 
     """
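Note: the guarded import makes torch optional: importing this module no longer fails when the neural submodule (which needs torch) cannot be loaded, and QuaNet degrades into an explanatory string instead of a trainer class. Downstream code can therefore test what it got; a minimal sketch (assuming this module is quapy.method.meta, as in the QuaPy layout):

    from quapy.method import meta

    if isinstance(meta.QuaNet, str):
        print(meta.QuaNet)         # explains that torch is missing
    else:
        trainer_cls = meta.QuaNet  # the real neural.QuaNetTrainer class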
@@ -65,9 +69,9 @@ class Ensemble(BaseQuantifier):
         if self.verbose:
             print('[Ensemble]' + msg)
 
-    def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float]=None):
+    def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float] = None):
         self.sout('Fit')
-        if self.policy=='ds' and not data.binary:
+        if self.policy == 'ds' and not data.binary:
             raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary')
         if val_split is None:
             val_split = self.val_split
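Note: beyond the PEP 8 spacing, behavior is unchanged: val_split still accepts either a LabelledCollection to validate on or a float fraction of the training set, and the 'ds' policy is still rejected for non-binary data. A hypothetical call (the ensemble and train variables are illustrative):

    # train is assumed to be a binary LabelledCollection
    ensemble.fit(train, val_split=0.4)  # 40% is held out internally for validation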
@@ -132,7 +136,7 @@ class Ensemble(BaseQuantifier):
         tests = [m[3] for m in self.ensemble]
         scores = []
         for i, model in enumerate(self.ensemble):
-            scores.append(evaluate(model[0], tests[:i] + tests[i+1:], error, self.n_jobs))
+            scores.append(evaluate(model[0], tests[:i] + tests[i + 1:], error, self.n_jobs))
         order = np.argsort(scores)
 
         self.ensemble = _select_k(self.ensemble, order, k=self.red_size)
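Note: this is a leave-one-out scoring scheme: each member (model[0]) is evaluated on the test samples generated for all the other members (tests[:i] + tests[i + 1:]), and np.argsort ranks members by ascending error so that the red_size best can be kept. An illustrative top-k selection, assuming _select_k keeps the first k elements of the given order:

    import numpy as np
    scores = np.asarray([0.12, 0.03, 0.40, 0.08])
    order = np.argsort(scores)   # -> array([1, 3, 0, 2]), best member first
    k = 2
    keep = [f'member{i}' for i in order[:k]]  # ['member1', 'member3']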
@@ -168,7 +172,7 @@ class Ensemble(BaseQuantifier):
         lr_base = LogisticRegression(class_weight='balanced', max_iter=1000)
 
         optim = GridSearchCV(
-            lr_base, param_grid={'C': np.logspace(-4,4,9)}, cv=5, n_jobs=self.n_jobs, refit=True
+            lr_base, param_grid={'C': np.logspace(-4, 4, 9)}, cv=5, n_jobs=self.n_jobs, refit=True
         ).fit(X, y)
 
         posteriors = cross_val_predict(
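Note: the grid is unchanged, only reformatted; np.logspace(-4, 4, 9) spans nine powers of ten for the regularization strength C:

    import numpy as np
    print(np.logspace(-4, 4, 9))
    # [1.e-04 1.e-03 1.e-02 1.e-01 1.e+00 1.e+01 1.e+02 1.e+03 1.e+04]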
@@ -204,8 +208,8 @@ class Ensemble(BaseQuantifier):
 
 
 def get_probability_distribution(posterior_probabilities, bins=8):
-    assert posterior_probabilities.shape[1]==2, 'the posterior probabilities do not seem to be for a binary problem'
-    posterior_probabilities = posterior_probabilities[:,1]  # take the positive posteriors only
+    assert posterior_probabilities.shape[1] == 2, 'the posterior probabilities do not seem to be for a binary problem'
+    posterior_probabilities = posterior_probabilities[:, 1]  # take the positive posteriors only
     distribution, _ = np.histogram(posterior_probabilities, bins=bins, range=(0, 1), density=True)
     return distribution
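Note: get_probability_distribution reduces a column of binary posteriors to a fixed-length density histogram over [0, 1], presumably the representation that the 'ds' policy compares across samples. An illustrative call:

    import numpy as np
    posteriors = np.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5], [0.3, 0.7]])
    dist = get_probability_distribution(posteriors, bins=8)
    print(dist.shape)  # (8,) -- one density value per bin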
@@ -223,7 +227,7 @@ def _delayed_new_instance(args):
     if val_split is not None:
         if isinstance(val_split, float):
             assert 0 < val_split < 1, 'val_split should be in (0,1)'
-            data, val_split = data.split_stratified(train_prop=1-val_split)
+            data, val_split = data.split_stratified(train_prop=1 - val_split)
 
     sample_index = data.sampling_index(sample_size, *prev)
     sample = data.sampling_from_index(sample_index)
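Note: only the spacing changed here. As the unpacking implies, split_stratified(train_prop=1 - val_split) returns the training portion first and the stratified held-out portion second, e.g.:

    # illustrative: with val_split=0.4 this yields a stratified 60/40 split
    data, val_split = data.split_stratified(train_prop=1 - 0.4)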
@@ -255,7 +259,7 @@ def _draw_simplex(ndim, min_val, max_trials=100):
     :return: a sample from the ndim-dimensional simplex that is uniform in S(ndim)-R where S(ndim) is the simplex
     and R is the simplex subset containing dimensions lower than min_val
     """
-    if min_val >= 1/ndim:
+    if min_val >= 1 / ndim:
         raise ValueError(f'no sample can be draw from the {ndim}-dimensional simplex so that '
                          f'all its values are >={min_val} (try with a larger value for min_pos)')
     trials = 0
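Note: the guard is geometrically necessary: simplex coordinates sum to 1, so the smallest coordinate can never exceed 1/ndim; requiring min_val >= 1/ndim leaves at most the single point (1/ndim, ..., 1/ndim), and rejection sampling would never terminate. A hedged sketch of the rejection strategy the docstring describes (not necessarily the function's actual implementation):

    import numpy as np

    def draw_simplex_sketch(ndim, min_val, max_trials=100):
        # dirichlet with unit concentration samples uniformly from the simplex
        for _ in range(max_trials):
            u = np.random.dirichlet(np.ones(ndim))
            if u.min() >= min_val:  # accept only samples away from the borders
                return u
        raise RuntimeError(f'no valid sample found in {max_trials} trials')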
@@ -300,14 +304,15 @@ def _check_error(error):
                          f'the name of an error function in {qp.error.ERROR_NAMES}')
 
 
-def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, param_model_sel:dict=None, **kwargs):
+def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, param_model_sel: dict = None,
+                    **kwargs):
     if optim is not None:
         if param_grid is None:
             raise ValueError(f'param_grid is None but optim was requested.')
         if param_model_sel is None:
             raise ValueError(f'param_model_sel is None but optim was requested.')
     error = _check_error(optim)
     return _instantiate_ensemble(learner, base_quantifier_class, param_grid, error, param_model_sel, **kwargs)
 
 
 def ECC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
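Note: the factory's contract is unchanged: requesting optim (a quantification error to optimize) requires both a param_grid for the base learner and param_model_sel settings for model selection. A hypothetical call through the ECC shorthand (the param_mod_sel keys shown are assumptions, not taken from this diff):

    from sklearn.linear_model import LogisticRegression

    ecc = ECC(LogisticRegression(),
              param_grid={'C': [0.1, 1, 10]},  # grid for the base classifier
              optim='mae',                     # quantification error to minimize
              param_mod_sel={'sample_size': 500, 'eval_budget': 100})  # assumed keys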
@@ -327,4 +332,4 @@ def EHDy(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
 
 
 def EEMQ(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
     return ensembleFactory(learner, EMQ, param_grid, optim, param_mod_sel, **kwargs)
diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py
--- a/quapy/tests/test_methods.py
+++ b/quapy/tests/test_methods.py
@@ -100,6 +100,12 @@ def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
 
 
 def test_quanet_method():
+    try:
+        import quapy.classification.neural
+    except ModuleNotFoundError:
+        print('skipping QuaNet test due to missing torch package')
+        return
+
     dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
     dataset = Dataset(dataset.training.sampling(100, *dataset.training.prevalence()),
                       dataset.test.sampling(100, *dataset.test.prevalence()))
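Note: the test now degrades gracefully instead of erroring out when torch is absent. An alternative sketch with pytest, which would report the test as skipped rather than passed:

    import pytest
    torch = pytest.importorskip('torch')  # skips this test if torch is missing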
diff --git a/setup.py b/setup.py
new file mode 100644
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,164 @@
+from setuptools import setup, find_packages
+import pathlib
+
+here = pathlib.Path(__file__).parent.resolve()
+
+long_description = (here / 'README.md').read_text(encoding='utf-8')
+
+
+def get_version(rel_path):
+    init_content = (here / rel_path).read_text(encoding='utf-8')
+    for line in init_content.split('\n'):
+        if line.startswith('__version__'):
+            delim = '"' if '"' in line else "'"
+            return line.split(delim)[1]
+    else:
+        raise RuntimeError("Unable to find version string.")
+
+
+# Arguments marked as "Required" below must be included for upload to PyPI.
+# Fields marked as "Optional" may be commented out.
+
+setup(
+    # This is the name of your project. The first time you publish this
+    # package, this name will be registered for you. It will determine how
+    # users can install this project, e.g.:
+    #
+    # $ pip install sampleproject
+    #
+    # And where it will live on PyPI: https://pypi.org/project/sampleproject/
+    #
+    # There are some restrictions on what makes a valid project name
+    # specification here:
+    # https://packaging.python.org/specifications/core-metadata/#name
+    name='QuaPy',  # Required
+
+    # Versions should comply with PEP 440:
+    # https://www.python.org/dev/peps/pep-0440/
+    #
+    # For a discussion on single-sourcing the version across setup.py and the
+    # project code, see
+    # https://packaging.python.org/en/latest/single_source_version.html
+    version=get_version("quapy/__init__.py"),  # Required
+
+    # This is a one-line description or tagline of what your project does. This
+    # corresponds to the "Summary" metadata field:
+    # https://packaging.python.org/specifications/core-metadata/#summary
+    description='QuaPy: a framework for Quantification in Python',  # Optional
+
+    # This is an optional longer description of your project that represents
+    # the body of text which users will see when they visit PyPI.
+    #
+    # Often, this is the same as your README, so you can just read it in from
+    # that file directly (as we have already done above)
+    #
+    # This field corresponds to the "Description" metadata field:
+    # https://packaging.python.org/specifications/core-metadata/#description-optional
+    long_description=long_description,  # Optional
+
+    # Denotes that our long_description is in Markdown; valid values are
+    # text/plain, text/x-rst, and text/markdown
+    #
+    # Optional if long_description is written in reStructuredText (rst) but
+    # required for plain-text or Markdown; if unspecified, "applications should
+    # attempt to render [the long_description] as text/x-rst; charset=UTF-8 and
+    # fall back to text/plain if it is not valid rst" (see link below)
+    #
+    # This field corresponds to the "Description-Content-Type" metadata field:
+    # https://packaging.python.org/specifications/core-metadata/#description-content-type-optional
+    long_description_content_type='text/markdown',  # Optional (see note above)
+
+    # This should be a valid link to your project's main homepage.
+    #
+    # This field corresponds to the "Home-Page" metadata field:
+    # https://packaging.python.org/specifications/core-metadata/#home-page-optional
+    url='https://github.com/HLT-ISTI/QuaPy',  # Optional
+
+    maintainer='Alejandro Moreo',
+
+    maintainer_email='alejandro.moreo@isti.cnr.it',
+
+    classifiers=[
+        'Development Status :: 4 - Beta',
+
+        'Intended Audience :: Developers',
+        'Intended Audience :: Science/Research',
+        'Programming Language :: Python',
+        'Topic :: Software Development',
+        'Topic :: Scientific/Engineering',
+
+        'License :: OSI Approved :: BSD License',
+
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3 :: Only',
+    ],
+
+    keywords='machine learning, quantification, classification, prevalence estimation, priors estimate',
+
+    # When your source code is in a subdirectory under the project root, e.g.
+    # `src/`, it is necessary to specify the `package_dir` argument.
+    # package_dir={'': 'src'},  # Optional
+
+    # You can just specify package directories manually here if your project is
+    # simple. Or you can use find_packages().
+    #
+    # Alternatively, if you just want to distribute a single Python file, use
+    # the `py_modules` argument instead as follows, which will expect a file
+    # called `my_module.py` to exist:
+    #
+    #   py_modules=["my_module"],
+    #
+    packages=find_packages(include=['quapy', 'quapy.*']),  # Required
+
+    python_requires='>=3.6, <4',
+
+    install_requires=['scikit-learn', 'pandas', 'tqdm', 'matplotlib'],
+
+    # List additional groups of dependencies here (e.g. development
+    # dependencies). Users will be able to install these using the "extras"
+    # syntax, for example:
+    #
+    #   $ pip install sampleproject[dev]
+    #
+    # Similar to `install_requires` above, these must be valid existing
+    # projects.
+    # extras_require={  # Optional
+    #     'dev': ['check-manifest'],
+    #     'test': ['coverage'],
+    # },
+
+    # If there are data files included in your packages that need to be
+    # installed, specify them here.
+    # package_data={  # Optional
+    #     'sample': ['package_data.dat'],
+    # },
+
+    # Although 'package_data' is the preferred approach, in some case you may
+    # need to place data files outside of your packages. See:
+    # http://docs.python.org/distutils/setupscript.html#installing-additional-files
+    #
+    # In this case, 'data_file' will be installed into '<sys.prefix>/my_data'
+    # data_files=[('my_data', ['data/data_file'])],  # Optional
+
+    # To provide executable scripts, use entry points in preference to the
+    # "scripts" keyword. Entry points provide cross-platform support and allow
+    # `pip` to create the appropriate form of executable for the target
+    # platform.
+    #
+    # For example, the following would provide a command called `sample` which
+    # executes the function `main` from this package when invoked:
+    # entry_points={  # Optional
+    #     'console_scripts': [
+    #         'sample=sample:main',
+    #     ],
+    # },
+
+    project_urls={  # Optional
+        'Contributors': 'https://github.com/HLT-ISTI/QuaPy/graphs/contributors',
+        'Bug Reports': 'https://github.com/HLT-ISTI/QuaPy/issues',
+        'Documentation': 'https://github.com/HLT-ISTI/QuaPy/wiki',
+        'Source': 'https://github.com/HLT-ISTI/QuaPy/',
+    },
+)
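Note: this new setup.py is why 'allow for "pip install"' could be dropped from TODO.txt above. The version is single-sourced: get_version() scans quapy/__init__.py for the __version__ line that this same commit bumps, so

    get_version('quapy/__init__.py')  # -> '0.1.4'

and the package now installs from a source checkout with

    $ pip install .    # or `pip install -e .` during development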