diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 17a6c39..dd40642 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -6,6 +6,8 @@ on:
     branches:
       - master
       - devel
+    tags:
+      - "[0-9]+.[0-9]+.[0-9]+"
 
 jobs:
 
@@ -28,7 +30,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip setuptools wheel
-        python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
+        python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@main"
        python -m pip install -e .[bayes,tests]
     - name: Test with unittest
       run: python -m unittest
@@ -47,7 +49,7 @@ jobs:
     - name: Install dependencies
       run: |
        python -m pip install --upgrade pip setuptools wheel "jax[cpu]"
-        python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
+        python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@main"
        python -m pip install -e .[neural,docs]
     - name: Build documentation
       run: sphinx-build -M html docs/source docs/build
@@ -66,3 +68,41 @@ jobs:
         branch: gh-pages
         directory: __gh-pages/
         github_token: ${{ secrets.GITHUB_TOKEN }}
+
+  release:
+    name: Build & Publish Release
+    runs-on: ubuntu-latest
+    if: startsWith(github.ref, 'refs/tags/')
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install build dependencies
+        run: |
+          python -m pip install --upgrade pip build twine
+      - name: Build package
+        run: python -m build
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          # for testing, use these two lines (and comment out the PyPI ones below):
+          # password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+          # repository_url: https://test.pypi.org/legacy/
+          password: ${{ secrets.PYPI_API_TOKEN }}
+          repository_url: https://upload.pypi.org/legacy/
+      - name: Create GitHub Release
+        id: create_release
+        uses: actions/create-release@v1
+        with:
+          tag_name: ${{ github.ref_name }}
+          release_name: Release ${{ github.ref_name }}
+          body: |
+            Changes in this release:
+            - see commit history for details
+          draft: false
+          prerelease: false
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 64c94fa..9253112 100644
--- a/.gitignore
+++ b/.gitignore
@@ -73,7 +73,8 @@ instance/
 .vscode/
 
 # Sphinx documentation
-docs/_build/
+docs/_build/doctest
+docs/_build/doctrees
 
 # PyBuilder
 target/
diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt
index 48ee2f3..b6066b9 100644
--- a/CHANGE_LOG.txt
+++ b/CHANGE_LOG.txt
@@ -1,12 +1,17 @@
-Change Log 0.1.10
+Change Log 0.2.1
 -----------------
 
-CLEAN TODO-FILE
+- Improved documentation of confidence regions.
+- Added the ReadMe method by Daniel Hopkins and Gary King.
+- The internal index in LabelledCollection is now "lazy", and is only constructed if required.
+
+Change Log 0.2.0
+-----------------
 
 - Base code Refactor:
   - Removing coupling between LabelledCollection and quantification methods; the fit interface changes:
       def fit(data:LabelledCollection): -> def fit(X, y):
-  - Adding function "predict" (function "quantify" is still present as an alias)
+  - Adding function "predict" (function "quantify" is still present as an alias, for the nostalgic)
   - Aggregative methods' behavior in terms of fit_classifier and how to treat the val_split is now indicated
     exclusively at construction time, and it is no longer possible to indicate it at fit time.
     This is because, in v<=0.1.9, one could create a method (e.g., ACC) and then indicate:
@@ -21,15 +26,16 @@
 - A new parameter "on_calib_error" is passed to the constructor, which indicates the policy to follow in case the
   abstention's calibration functions fail (which happens sometimes). Options include:
   - 'raise': raises a RuntimeException (default)
-  - 'backup': reruns avoiding calibration
+  - 'backup': reruns by silently avoiding calibration
 - Parameter "recalib" has been renamed "calib"
 - Added aggregative bootstrap for deriving confidence regions (confidence intervals, ellipses in the simplex, or
   ellipses in the CLR space). This method is efficient as it leverages the two phases of aggregative quantifiers:
   resampling is applied only to the aggregation phase, thus avoiding training many quantifiers or classifying the
   instances of a sample multiple times. See:
   - quapy/method/confidence.py (new)
-  - the new example no. 15.
-- BayesianCC moved to confidence.py, where methods having to do with confidence intervals live
+  - the new example no. 16.confidence_regions.py
+- BayesianCC moved to confidence.py, where methods having to do with confidence intervals belong.
+- Improved documentation of the qp.plot module.
 
 Change Log 0.1.9
 
diff --git a/README.md b/README.md
index 955cee1..730a433 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,9 @@ for facilitating the analysis and interpretation of the experimental results.
 
 ### Last updates:
 
-* Version 0.1.9 is released! major changes can be consulted [here](CHANGE_LOG.txt).
-* The developer API documentation is available [here](https://hlt-isti.github.io/QuaPy/build/html/modules.html)
+* Version 0.2.0 is released! Major changes can be consulted [here](CHANGE_LOG.txt).
+* The developer API documentation is available [here](https://hlt-isti.github.io/QuaPy/index.html)
+* Manuals are available [here](https://hlt-isti.github.io/QuaPy/manuals.html)
 
 ### Installation
 
@@ -46,12 +47,12 @@ of the test set.
 
 ```python
 import quapy as qp
 
-dataset = qp.datasets.fetch_UCIBinaryDataset("yeast")
-training, test = dataset.train_test
+training, test = qp.datasets.fetch_UCIBinaryDataset("yeast").train_test
 
 # create an "Adjusted Classify & Count" quantifier
 model = qp.method.aggregative.ACC()
-model.fit(training)
+Xtr, ytr = training.Xy
+model.fit(Xtr, ytr)
 
 estim_prevalence = model.predict(test.X)
 true_prevalence = test.prevalence()
@@ -67,7 +68,7 @@ class prevalence of the training set. For this reason, any quantification model
 should be tested across many samples, even ones characterized by class prevalence values different or very different
 from those found in the training set. QuaPy implements sampling procedures and evaluation protocols that
 automate this workflow.
-See the [documentation](https://hlt-isti.github.io/QuaPy/build/html/) for detailed examples.
+See the [documentation](https://hlt-isti.github.io/QuaPy/manuals.html) for detailed examples.
 
 ## Features
 
@@ -79,7 +80,8 @@ quantification methods based on structured output learning, HDy, QuaNet, quantif
   * 32 UCI Machine Learning datasets.
   * 11 Twitter quantification-by-sentiment datasets.
   * 3 product reviews quantification-by-sentiment datasets.
-  * 4 tasks from LeQua competition (_new in v0.1.7!_)
+  * 4 tasks from the LeQua 2022 competition and 4 tasks from the LeQua 2024 competition
+  * IFCB for plankton quantification
 * Native support for binary and single-label multiclass quantification scenarios.
 * Model selection functionality that minimizes quantification-oriented loss functions.
 * Visualization tools for analysing the experimental results.
 
@@ -100,19 +102,23 @@ In case you want to contribute improvements to quapy, please generate pull reque
 
 ## Documentation
 
-The [developer API documentation](https://hlt-isti.github.io/QuaPy/build/html/modules.html) is available [here](https://hlt-isti.github.io/QuaPy/build/html/index.html).
+Check out the [developer API documentation here](https://hlt-isti.github.io/QuaPy/index.html).
 
-Check out our [Wiki](https://github.com/HLT-ISTI/QuaPy/wiki), in which many examples
+Check out the [Manuals](https://hlt-isti.github.io/QuaPy/manuals.html), in which many code examples
 are provided:
 
-* [Datasets](https://github.com/HLT-ISTI/QuaPy/wiki/Datasets)
-* [Evaluation](https://github.com/HLT-ISTI/QuaPy/wiki/Evaluation)
-* [Protocols](https://github.com/HLT-ISTI/QuaPy/wiki/Protocols)
-* [Methods](https://github.com/HLT-ISTI/QuaPy/wiki/Methods)
-* [SVMperf](https://github.com/HLT-ISTI/QuaPy/wiki/ExplicitLossMinimization)
-* [Model Selection](https://github.com/HLT-ISTI/QuaPy/wiki/Model-Selection)
-* [Plotting](https://github.com/HLT-ISTI/QuaPy/wiki/Plotting)
+* [Datasets](https://hlt-isti.github.io/QuaPy/manuals/datasets.html)
+* [Evaluation](https://hlt-isti.github.io/QuaPy/manuals/evaluation.html)
+* [Protocols](https://hlt-isti.github.io/QuaPy/manuals/protocols.html)
+* [Methods](https://hlt-isti.github.io/QuaPy/manuals/methods.html)
+* [SVMperf](https://hlt-isti.github.io/QuaPy/manuals/explicit-loss-minimization.html)
+* [Model Selection](https://hlt-isti.github.io/QuaPy/manuals/model-selection.html)
+* [Plotting](https://hlt-isti.github.io/QuaPy/manuals/plotting.html)
 
 ## Acknowledgments: SoBigData++
+
+This work has been supported by the QuaDaSh project
+_"Finanziato dall’Unione europea---Next Generation EU,
+Missione 4 Componente 2 CUP B53D23026250001"_.
diff --git a/TODO.txt b/TODO.txt
index cbc7a9f..de40ed9 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,5 +1,7 @@
 Adapt examples; remaining: example 4-onwards
-not working: 4, 4b, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+not working: 15 (qunfold)
+
+Solve the warnings issue; right now there is a warning ignore in method/__init__.py:
 
 Add 'platt' to calib options in EMQ?
 
@@ -55,3 +57,4 @@ Para quitar el labelledcollection de los métodos:
 - [TODO] add Friedman's method and DeBias
 - [TODO] check ignore warning stuff
   check https://docs.python.org/3/library/warnings.html#temporarily-suppressing-warnings
+- [TODO] nmd and md are not selectable from qp.evaluation.evaluate as a string
\ No newline at end of file
diff --git a/docs/build/html/_modules/index.html b/docs/build/html/_modules/index.html
new file mode 100644
index 0000000..9951ba2
--- /dev/null
+++ b/docs/build/html/_modules/index.html
@@ -0,0 +1,124 @@
+Overview: module code — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/classification/calibration.html b/docs/build/html/_modules/quapy/classification/calibration.html
new file mode 100644
index 0000000..6576c6c
--- /dev/null
+++ b/docs/build/html/_modules/quapy/classification/calibration.html
@@ -0,0 +1,319 @@
+quapy.classification.calibration — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.classification.calibration

+from copy import deepcopy
+
+from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
+from sklearn.base import BaseEstimator, clone
+from sklearn.model_selection import cross_val_predict, train_test_split
+import numpy as np
+
+
+# Wrappers of calibration defined by Alexandari et al. in paper <http://proceedings.mlr.press/v119/alexandari20a.html>
+# requires "pip install abstention"
+# see https://github.com/kundajelab/abstention
+
+
+
[docs]class RecalibratedProbabilisticClassifier: + """ + Abstract class for (re)calibration method from `abstention.calibration`, as defined in + `Alexandari, A., Kundaje, A., & Shrikumar, A. (2020, November). Maximum likelihood with bias-corrected calibration + is hard-to-beat at label shift adaptation. In International Conference on Machine Learning (pp. 222-232). PMLR. + <http://proceedings.mlr.press/v119/alexandari20a.html>`_: + """ + pass
+ + +
[docs]class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabilisticClassifier): + """ + Applies a (re)calibration method from `abstention.calibration`, as defined in + `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_. + + + :param classifier: a scikit-learn probabilistic classifier + :param calibrator: the calibration object (an instance of abstention.calibration.CalibratorFactory) + :param val_split: indicate an integer k for performing kFCV to obtain the posterior probabilities, or a float p + in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the + training instances (the rest is used for training). In any case, the classifier is retrained in the whole + training set afterwards. Default value is 5. + :param n_jobs: indicate the number of parallel workers (only when val_split is an integer); default=None + :param verbose: whether or not to display information in the standard output + """ + + def __init__(self, classifier, calibrator, val_split=5, n_jobs=None, verbose=False): + self.classifier = classifier + self.calibrator = calibrator + self.val_split = val_split + self.n_jobs = n_jobs + self.verbose = verbose + +
[docs] def fit(self, X, y): + """ + Fits the calibration for the probabilistic classifier. + + :param X: array-like of shape `(n_samples, n_features)` with the data instances + :param y: array-like of shape `(n_samples,)` with the class labels + :return: self + """ + k = self.val_split + if isinstance(k, int): + if k < 2: + raise ValueError('wrong value for val_split: the number of folds must be at least 2') + return self.fit_cv(X, y) + elif isinstance(k, float): + if not (0 < k < 1): + raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)') + return self.fit_tr_val(X, y)
+ +
[docs] def fit_cv(self, X, y): + """ + Fits the calibration in a cross-validation manner, i.e., it generates posterior probabilities for all + training instances via cross-validation, and then retrains the classifier on all training instances. + The posterior probabilities thus generated are used for calibrating the outputs of the classifier. + + :param X: array-like of shape `(n_samples, n_features)` with the data instances + :param y: array-like of shape `(n_samples,)` with the class labels + :return: self + """ + posteriors = cross_val_predict( + self.classifier, X, y, cv=self.val_split, n_jobs=self.n_jobs, verbose=self.verbose, method='predict_proba' + ) + self.classifier.fit(X, y) + nclasses = len(np.unique(y)) + self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[y], posterior_supplied=True) + return self
+ +
[docs] def fit_tr_val(self, X, y): + """ + Fits the calibration in a train/val-split manner, i.e., it partitions the training instances into a + training and a validation set, and then uses the training samples to learn a classifier, which is then used + to generate posterior probabilities for the held-out validation data. These posteriors are used to calibrate + the classifier. The classifier is not retrained on the whole dataset. + + :param X: array-like of shape `(n_samples, n_features)` with the data instances + :param y: array-like of shape `(n_samples,)` with the class labels + :return: self + """ + Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=self.val_split, stratify=y) + self.classifier.fit(Xtr, ytr) + posteriors = self.classifier.predict_proba(Xva) + nclasses = len(np.unique(yva)) + self.calibration_function = self.calibrator(posteriors, np.eye(nclasses)[yva], posterior_supplied=True) + return self
+ +
[docs] def predict(self, X): + """ + Predicts class labels for the data instances in `X` + + :param X: array-like of shape `(n_samples, n_features)` with the data instances + :return: array-like of shape `(n_samples,)` with the class label predictions + """ + return self.classifier.predict(X)
+ +
[docs] def predict_proba(self, X): + """ + Generates posterior probabilities for the data instances in `X` + + :param X: array-like of shape `(n_samples, n_features)` with the data instances + :return: array-like of shape `(n_samples, n_classes)` with posterior probabilities + """ + posteriors = self.classifier.predict_proba(X) + return self.calibration_function(posteriors)
+ + @property + def classes_(self): + """ + Returns the classes on which the classifier has been trained on + + :return: array-like of shape `(n_classes)` + """ + return self.classifier.classes_
+ + @property + def classes_(self): + """ + Returns the classes on which the classifier has been trained + + :return: array-like of shape `(n_classes,)` + """ + return self.classifier.classes_
[docs]class NBVSCalibration(RecalibratedProbabilisticClassifierBase): + """ + Applies the No-Bias Vector Scaling (NBVS) calibration method from `abstention.calibration`, as defined in + `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_: + + :param classifier: a scikit-learn probabilistic classifier + :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p + in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the + training instances (the rest is used for training). In any case, the classifier is retrained in the whole + training set afterwards. Default value is 5. + :param n_jobs: indicate the number of parallel workers (only when val_split is an integer) + :param verbose: whether or not to display information in the standard output + """ + + def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False): + self.classifier = classifier + self.calibrator = NoBiasVectorScaling(verbose=verbose) + self.val_split = val_split + self.n_jobs = n_jobs + self.verbose = verbose
+ + +
[docs]class BCTSCalibration(RecalibratedProbabilisticClassifierBase): + """ + Applies the Bias-Corrected Temperature Scaling (BCTS) calibration method from `abstention.calibration`, as defined in + `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_: + + :param classifier: a scikit-learn probabilistic classifier + :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p + in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the + training instances (the rest is used for training). In any case, the classifier is retrained in the whole + training set afterwards. Default value is 5. + :param n_jobs: indicate the number of parallel workers (only when val_split is an integer) + :param verbose: whether or not to display information in the standard output + """ + + def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False): + self.classifier = classifier + self.calibrator = TempScaling(verbose=verbose, bias_positions='all') + self.val_split = val_split + self.n_jobs = n_jobs + self.verbose = verbose
+ + +
[docs]class TSCalibration(RecalibratedProbabilisticClassifierBase): + """ + Applies the Temperature Scaling (TS) calibration method from `abstention.calibration`, as defined in + `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_: + + :param classifier: a scikit-learn probabilistic classifier + :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p + in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the + training instances (the rest is used for training). In any case, the classifier is retrained in the whole + training set afterwards. Default value is 5. + :param n_jobs: indicate the number of parallel workers (only when val_split is an integer) + :param verbose: whether or not to display information in the standard output + """ + + def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False): + self.classifier = classifier + self.calibrator = TempScaling(verbose=verbose) + self.val_split = val_split + self.n_jobs = n_jobs + self.verbose = verbose
+ + +
[docs]class VSCalibration(RecalibratedProbabilisticClassifierBase): + """ + Applies the Vector Scaling (VS) calibration method from `abstention.calibration`, as defined in + `Alexandari et al. paper <http://proceedings.mlr.press/v119/alexandari20a.html>`_: + + :param classifier: a scikit-learn probabilistic classifier + :param val_split: indicate an integer k for performing kFCV to obtain the posterior prevalences, or a float p + in (0,1) to indicate that the posteriors are obtained in a stratified validation split containing p% of the + training instances (the rest is used for training). In any case, the classifier is retrained in the whole + training set afterwards. Default value is 5. + :param n_jobs: indicate the number of parallel workers (only when val_split is an integer) + :param verbose: whether or not to display information in the standard output + """ + + def __init__(self, classifier, val_split=5, n_jobs=None, verbose=False): + self.classifier = classifier + self.calibrator = VectorScaling(verbose=verbose) + self.val_split = val_split + self.n_jobs = n_jobs + self.verbose = verbose
+ +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/classification/methods.html b/docs/build/html/_modules/quapy/classification/methods.html
new file mode 100644
index 0000000..883f802
--- /dev/null
+++ b/docs/build/html/_modules/quapy/classification/methods.html
@@ -0,0 +1,220 @@
+quapy.classification.methods — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.classification.methods

+from sklearn.base import BaseEstimator
+from sklearn.decomposition import TruncatedSVD
+from sklearn.linear_model import LogisticRegression
+
+
+
+[docs] +class LowRankLogisticRegression(BaseEstimator): + """ + An example of a classification method (i.e., an object that implements `fit`, `predict`, and `predict_proba`) + that also generates embedded inputs (i.e., that implements `transform`), as those required for + :class:`quapy.method.neural.QuaNet`. This is a mock method to allow for easily instantiating + :class:`quapy.method.neural.QuaNet` on array-like real-valued instances. + The transformation consists of applying :class:`sklearn.decomposition.TruncatedSVD` + while classification is performed using :class:`sklearn.linear_model.LogisticRegression` on the low-rank space. + + :param n_components: the number of principal components to retain + :param kwargs: parameters for the + `Logistic Regression <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`__ classifier + """ + + def __init__(self, n_components=100, **kwargs): + self.n_components = n_components + self.classifier = LogisticRegression(**kwargs) + +
+[docs] + def get_params(self): + """ + Get hyper-parameters for this estimator. + + :return: a dictionary with parameter names mapped to their values + """ + params = {'n_components': self.n_components} + params.update(self.classifier.get_params()) + return params
+ + +
+[docs] + def set_params(self, **params): + """ + Set the parameters of this estimator. + + :param parameters: a `**kwargs` dictionary with the estimator parameters for + `Logistic Regression <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`__ + and eventually also `n_components` for `TruncatedSVD` + """ + params_ = dict(params) + if 'n_components' in params_: + self.n_components = params_['n_components'] + del params_['n_components'] + self.classifier.set_params(**params_)
+ + +
+[docs] + def fit(self, X, y): + """ + Fit the model according to the given training data. The fit consists of + fitting `TruncatedSVD` and then `LogisticRegression` on the low-rank representation. + + :param X: array-like of shape `(n_samples, n_features)` with the instances + :param y: array-like of shape `(n_samples, n_classes)` with the class labels + :return: `self` + """ + nF = X.shape[1] + self.pca = None + if nF > self.n_components: + self.pca = TruncatedSVD(self.n_components).fit(X) + X = self.transform(X) + self.classifier.fit(X, y) + self.classes_ = self.classifier.classes_ + return self
+ + +
+[docs] + def predict(self, X): + """ + Predicts labels for the instances `X` embedded into the low-rank space. + + :param X: array-like of shape `(n_samples, n_features)` instances to classify + :return: a `numpy` array of length `n` containing the label predictions, where `n` is the number of + instances in `X` + """ + X = self.transform(X) + return self.classifier.predict(X)
+ + +
+[docs] + def predict_proba(self, X): + """ + Predicts posterior probabilities for the instances `X` embedded into the low-rank space. + + :param X: array-like of shape `(n_samples, n_features)` instances to classify + :return: array-like of shape `(n_samples, n_classes)` with the posterior probabilities + """ + X = self.transform(X) + return self.classifier.predict_proba(X)
+ + +
+[docs] + def transform(self, X): + """ + Returns the low-rank approximation of `X` with `n_components` dimensions, or `X` unaltered if + `n_components` >= `X.shape[1]`. + + :param X: array-like of shape `(n_samples, n_features)` instances to embed + :return: array-like of shape `(n_samples, n_components)` with the embedded instances + """ + if self.pca is None: + return X + return self.pca.transform(X)
+
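LowRankLogisticRegression is mainly intended as a drop-in classifier for QuaNet, since it exposes `transform` in addition to the usual `fit`/`predict`/`predict_proba`. A small sketch on synthetic dense data follows; the dataset and parameter values are arbitrary and only chosen so that the SVD step actually triggers (n_features > n_components).

```python
# Minimal sketch of LowRankLogisticRegression on synthetic dense data (illustrative only).
from sklearn.datasets import make_classification
from quapy.classification.methods import LowRankLogisticRegression

X, y = make_classification(n_samples=500, n_features=300, n_informative=20, random_state=0)

clf = LowRankLogisticRegression(n_components=50, max_iter=1000)  # extra kwargs go to LogisticRegression
clf.fit(X, y)

print(clf.predict(X[:5]))               # label predictions
print(clf.predict_proba(X[:5]).shape)   # (5, 2) posterior probabilities
print(clf.transform(X[:5]).shape)       # (5, 50) low-rank embeddings, as required by QuaNet
```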
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/classification/neural.html b/docs/build/html/_modules/quapy/classification/neural.html
new file mode 100644
index 0000000..fd5d9b1
--- /dev/null
+++ b/docs/build/html/_modules/quapy/classification/neural.html
@@ -0,0 +1,715 @@
+quapy.classification.neural — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.classification.neural

+import os
+from abc import ABCMeta, abstractmethod
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from sklearn.metrics import accuracy_score, f1_score
+from torch.nn.utils.rnn import pad_sequence
+from tqdm import tqdm
+
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.util import EarlyStop
+
+
+
+[docs] +class NeuralClassifierTrainer: + """ + Trains a neural network for text classification. + + :param net: an instance of `TextClassifierNet` implementing the forward pass + :param lr: learning rate (default 1e-3) + :param weight_decay: weight decay (default 0) + :param patience: number of epochs that do not show any improvement in validation + to wait before applying early stop (default 10) + :param epochs: maximum number of training epochs (default 200) + :param batch_size: batch size for training (default 64) + :param batch_size_test: batch size for test (default 512) + :param padding_length: maximum number of tokens to consider in a document (default 300) + :param device: specify 'cpu' (default) or 'cuda' for enabling gpu + :param checkpointpath: where to store the parameters of the best model found so far + according to the evaluation in the held-out validation split (default '../checkpoint/classifier_net.dat') + """ + + def __init__(self, + net: 'TextClassifierNet', + lr=1e-3, + weight_decay=0, + patience=10, + epochs=200, + batch_size=64, + batch_size_test=512, + padding_length=300, + device='cuda', + checkpointpath='../checkpoint/classifier_net.dat'): + + super().__init__() + + assert isinstance(net, TextClassifierNet), f'net is not an instance of {TextClassifierNet.__name__}' + self.net = net.to(device) + self.vocab_size = self.net.vocabulary_size + self.trainer_hyperparams={ + 'lr': lr, + 'weight_decay': weight_decay, + 'patience': patience, + 'epochs': epochs, + 'batch_size': batch_size, + 'batch_size_test': batch_size_test, + 'padding_length': padding_length, + 'device': torch.device(device) + } + self.learner_hyperparams = self.net.get_params() + self.checkpointpath = checkpointpath + + print(f'[NeuralNetwork running on {device}]') + os.makedirs(Path(checkpointpath).parent, exist_ok=True) + +
+[docs] + def reset_net_params(self, vocab_size, n_classes): + """Reinitialize the network parameters + + :param vocab_size: the size of the vocabulary + :param n_classes: the number of target classes + """ + self.net = self.net.__class__(vocab_size, n_classes, **self.learner_hyperparams) + self.net = self.net.to(self.trainer_hyperparams['device']) + self.net.xavier_uniform()
+ + +
+[docs] + def get_params(self): + """Get hyper-parameters for this estimator + + :return: a dictionary with parameter names mapped to their values + """ + return {**self.net.get_params(), **self.trainer_hyperparams}
+ + +
+[docs] + def set_params(self, **params): + """Set the parameters of this trainer and the learner it is training. + In this current version, parameter names for the trainer and learner should + be disjoint. + + :param params: a `**kwargs` dictionary with the parameters + """ + trainer_hyperparams = self.trainer_hyperparams + learner_hyperparams = self.net.get_params() + for key, val in params.items(): + if key in trainer_hyperparams and key in learner_hyperparams: + raise ValueError(f'the use of parameter {key} is ambiguous since it can refer to ' + f'a parameters of the Trainer or the learner {self.net.__name__}') + elif key not in trainer_hyperparams and key not in learner_hyperparams: + raise ValueError(f'parameter {key} is not valid') + + if key in trainer_hyperparams: + trainer_hyperparams[key] = val + else: + learner_hyperparams[key] = val + + self.trainer_hyperparams = trainer_hyperparams + self.learner_hyperparams = learner_hyperparams
+ + + @property + def device(self): + """ Gets the device in which the network is allocated + + :return: device + """ + return next(self.net.parameters()).device + + def _train_epoch(self, data, status, pbar, epoch): + self.net.train() + criterion = torch.nn.CrossEntropyLoss() + losses, predictions, true_labels = [], [], [] + for xi, yi in data: + self.optim.zero_grad() + logits = self.net.forward(xi) + loss = criterion(logits, yi) + loss.backward() + self.optim.step() + losses.append(loss.item()) + preds = torch.softmax(logits, dim=-1).detach().cpu().numpy().argmax(axis=-1) + + status["loss"] = np.mean(losses) + predictions.extend(preds.tolist()) + true_labels.extend(yi.detach().cpu().numpy().tolist()) + status["acc"] = accuracy_score(true_labels, predictions) + status["f1"] = f1_score(true_labels, predictions, average='macro') + self.__update_progress_bar(pbar, epoch) + + def _test_epoch(self, data, status, pbar, epoch): + self.net.eval() + criterion = torch.nn.CrossEntropyLoss() + losses, predictions, true_labels = [], [], [] + with torch.no_grad(): + for xi, yi in data: + logits = self.net.forward(xi) + loss = criterion(logits, yi) + losses.append(loss.item()) + preds = torch.softmax(logits, dim=-1).detach().cpu().numpy().argmax(axis=-1) + predictions.extend(preds.tolist()) + true_labels.extend(yi.detach().cpu().numpy().tolist()) + + status["loss"] = np.mean(losses) + status["acc"] = accuracy_score(true_labels, predictions) + status["f1"] = f1_score(true_labels, predictions, average='macro') + self.__update_progress_bar(pbar, epoch) + + def __update_progress_bar(self, pbar, epoch): + pbar.set_description(f'[{self.net.__class__.__name__}] training epoch={epoch} ' + f'tr-loss={self.status["tr"]["loss"]:.5f} ' + f'tr-acc={100 * self.status["tr"]["acc"]:.2f}% ' + f'tr-macroF1={100 * self.status["tr"]["f1"]:.2f}% ' + f'patience={self.early_stop.patience}/{self.early_stop.PATIENCE_LIMIT} ' + f'val-loss={self.status["va"]["loss"]:.5f} ' + f'val-acc={100 * self.status["va"]["acc"]:.2f}% ' + f'macroF1={100 * self.status["va"]["f1"]:.2f}%') + +
+[docs] + def fit(self, instances, labels, val_split=0.3): + """ + Fits the model according to the given training data. + + :param instances: list of lists of indexed tokens + :param labels: array-like of shape `(n_samples, n_classes)` with the class labels + :param val_split: proportion of training documents to be taken as the validation set (default 0.3) + :return: + """ + train, val = LabelledCollection(instances, labels).split_stratified(1-val_split) + self.classes_ = train.classes_ + opt = self.trainer_hyperparams + checkpoint = self.checkpointpath + self.reset_net_params(self.vocab_size, train.n_classes) + + train_generator = TorchDataset(train.instances, train.labels).asDataloader( + opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device']) + valid_generator = TorchDataset(val.instances, val.labels).asDataloader( + opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']) + + self.status = {'tr': {'loss': -1, 'acc': -1, 'f1': -1}, + 'va': {'loss': -1, 'acc': -1, 'f1': -1}} + + self.optim = torch.optim.Adam(self.net.parameters(), lr=opt['lr'], weight_decay=opt['weight_decay']) + self.early_stop = EarlyStop(opt['patience'], lower_is_better=False) + + with tqdm(range(1, opt['epochs'] + 1)) as pbar: + for epoch in pbar: + self._train_epoch(train_generator, self.status['tr'], pbar, epoch) + self._test_epoch(valid_generator, self.status['va'], pbar, epoch) + + self.early_stop(self.status['va']['f1'], epoch) + if self.early_stop.IMPROVED: + torch.save(self.net.state_dict(), checkpoint) + elif self.early_stop.STOP: + print(f'training ended by patience exhasted; loading best model parameters in {checkpoint} ' + f'for epoch {self.early_stop.best_epoch}') + self.net.load_state_dict(torch.load(checkpoint)) + break + + print('performing one training pass over the validation set...') + self._train_epoch(valid_generator, self.status['tr'], pbar, epoch=0) + print('[done]') + + return self
+ + +
+[docs] + def predict(self, instances): + """ + Predicts labels for the instances + + :param instances: list of lists of indexed tokens + :return: a `numpy` array of length `n` containing the label predictions, where `n` is the number of + instances in `X` + """ + return np.argmax(self.predict_proba(instances), axis=-1)
+ + +
+[docs] + def predict_proba(self, instances): + """ + Predicts posterior probabilities for the instances + + :param X: array-like of shape `(n_samples, n_features)` instances to classify + :return: array-like of shape `(n_samples, n_classes)` with the posterior probabilities + """ + self.net.eval() + opt = self.trainer_hyperparams + with torch.no_grad(): + posteriors = [] + for xi in TorchDataset(instances).asDataloader( + opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']): + posteriors.append(self.net.predict_proba(xi)) + return np.concatenate(posteriors)
+ + +
+[docs] + def transform(self, instances): + """ + Returns the embeddings of the instances + + :param instances: list of lists of indexed tokens + :return: array-like of shape `(n_samples, embed_size)` with the embedded instances, + where `embed_size` is defined by the classification network + """ + self.net.eval() + embeddings = [] + opt = self.trainer_hyperparams + with torch.no_grad(): + for xi in TorchDataset(instances).asDataloader( + opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']): + embeddings.append(self.net.document_embedding(xi).detach().cpu().numpy()) + return np.concatenate(embeddings)
+
+ + + +
+[docs] +class TorchDataset(torch.utils.data.Dataset): + """ + Transforms labelled instances into a Torch's :class:`torch.utils.data.DataLoader` object + + :param instances: list of lists of indexed tokens + :param labels: array-like of shape `(n_samples, n_classes)` with the class labels + """ + + def __init__(self, instances, labels=None): + self.instances = instances + self.labels = labels + + def __len__(self): + return len(self.instances) + + def __getitem__(self, index): + return {'doc': self.instances[index], 'label': self.labels[index] if self.labels is not None else None} + +
+[docs] + def asDataloader(self, batch_size, shuffle, pad_length, device): + """ + Converts the labelled collection into a Torch DataLoader with dynamic padding for + the batch + + :param batch_size: batch size + :param shuffle: whether or not to shuffle instances + :param pad_length: the maximum length for the list of tokens (dynamic padding is + applied, meaning that if the longest document in the batch is shorter than + `pad_length`, then the batch is padded up to its length, and not to `pad_length`. + :param device: whether to allocate tensors in cpu or in cuda + :return: a :class:`torch.utils.data.DataLoader` object + """ + def collate(batch): + data = [torch.LongTensor(item['doc'][:pad_length]) for item in batch] + data = pad_sequence(data, batch_first=True, padding_value=qp.environ['PAD_INDEX']).to(device) + targets = [item['label'] for item in batch] + if targets[0] is None: + return data + else: + targets = torch.as_tensor(targets, dtype=torch.long).to(device) + return [data, targets] + + torchDataset = TorchDataset(self.instances, self.labels) + return torch.utils.data.DataLoader(torchDataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate)
+
+ + + +
+[docs] +class TextClassifierNet(torch.nn.Module, metaclass=ABCMeta): + """ + Abstract Text classifier (`torch.nn.Module`) + """ + +
+[docs] + @abstractmethod + def document_embedding(self, x): + """Embeds documents (i.e., performs the forward pass up to the + next-to-last layer). + + :param x: a batch of instances, typically generated by a torch's `DataLoader` + instance (see :class:`quapy.classification.neural.TorchDataset`) + :return: a torch tensor of shape `(n_samples, n_dimensions)`, where + `n_samples` is the number of documents, and `n_dimensions` is the + dimensionality of the embedding + """ + ...
+ + +
+[docs] + def forward(self, x): + """Performs the forward pass. + + :param x: a batch of instances, typically generated by a torch's `DataLoader` + instance (see :class:`quapy.classification.neural.TorchDataset`) + :return: a tensor of shape `(n_instances, n_classes)` with the decision scores + for each of the instances and classes + """ + doc_embedded = self.document_embedding(x) + return self.output(doc_embedded)
+ + +
+[docs] + def dimensions(self): + """Gets the number of dimensions of the embedding space + + :return: integer + """ + return self.dim
+ + +
+[docs] + def predict_proba(self, x): + """ + Predicts posterior probabilities for the instances in `x` + + :param x: a torch tensor of indexed tokens with shape `(n_instances, pad_length)` + where `n_instances` is the number of instances in the batch, and `pad_length` + is length of the pad in the batch + :return: array-like of shape `(n_samples, n_classes)` with the posterior probabilities + """ + logits = self(x) + return torch.softmax(logits, dim=1).detach().cpu().numpy()
+ + +
+[docs] + def xavier_uniform(self): + """ + Performs Xavier initialization of the network parameters + """ + for p in self.parameters(): + if p.dim() > 1 and p.requires_grad: + torch.nn.init.xavier_uniform_(p)
+ + +
+[docs] + @abstractmethod + def get_params(self): + """ + Get hyper-parameters for this estimator + + :return: a dictionary with parameter names mapped to their values + """ + ...
+ + + @property + def vocabulary_size(self): + """ + Return the size of the vocabulary + + :return: integer + """ + ...
+ + + +
+[docs] +class LSTMnet(TextClassifierNet): + """ + An implementation of :class:`quapy.classification.neural.TextClassifierNet` based on + Long Short Term Memory networks. + + :param vocabulary_size: the size of the vocabulary + :param n_classes: number of target classes + :param embedding_size: the dimensionality of the word embeddings space (default 100) + :param hidden_size: the dimensionality of the hidden space (default 256) + :param repr_size: the dimensionality of the document embeddings space (default 100) + :param lstm_class_nlayers: number of LSTM layers (default 1) + :param drop_p: drop probability for dropout (default 0.5) + """ + + def __init__(self, vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100, lstm_class_nlayers=1, + drop_p=0.5): + + super().__init__() + self.vocabulary_size_ = vocabulary_size + self.n_classes = n_classes + self.hyperparams={ + 'embedding_size': embedding_size, + 'hidden_size': hidden_size, + 'repr_size': repr_size, + 'lstm_class_nlayers': lstm_class_nlayers, + 'drop_p': drop_p + } + + self.word_embedding = torch.nn.Embedding(vocabulary_size, embedding_size) + self.lstm = torch.nn.LSTM(embedding_size, hidden_size, lstm_class_nlayers, dropout=drop_p, batch_first=True) + self.dropout = torch.nn.Dropout(drop_p) + + self.dim = repr_size + self.doc_embedder = torch.nn.Linear(hidden_size, self.dim) + self.output = torch.nn.Linear(self.dim, n_classes) + + def __init_hidden(self, set_size): + opt = self.hyperparams + var_hidden = torch.zeros(opt['lstm_class_nlayers'], set_size, opt['hidden_size']) + var_cell = torch.zeros(opt['lstm_class_nlayers'], set_size, opt['hidden_size']) + if next(self.lstm.parameters()).is_cuda: + var_hidden, var_cell = var_hidden.cuda(), var_cell.cuda() + return var_hidden, var_cell + +
+[docs] + def document_embedding(self, x): + """Embeds documents (i.e., performs the forward pass up to the + next-to-last layer). + + :param x: a batch of instances, typically generated by a torch's `DataLoader` + instance (see :class:`quapy.classification.neural.TorchDataset`) + :return: a torch tensor of shape `(n_samples, n_dimensions)`, where + `n_samples` is the number of documents, and `n_dimensions` is the + dimensionality of the embedding + """ + embedded = self.word_embedding(x) + rnn_output, rnn_hidden = self.lstm(embedded, self.__init_hidden(x.size()[0])) + abstracted = self.dropout(F.relu(rnn_hidden[0][-1])) + abstracted = self.doc_embedder(abstracted) + return abstracted
+ + +
+[docs] + def get_params(self): + """ + Get hyper-parameters for this estimator + + :return: a dictionary with parameter names mapped to their values + """ + return self.hyperparams
+ + + @property + def vocabulary_size(self): + """ + Return the size of the vocabulary + + :return: integer + """ + return self.vocabulary_size_
+ + + +
+[docs] +class CNNnet(TextClassifierNet): + """ + An implementation of :class:`quapy.classification.neural.TextClassifierNet` based on + Convolutional Neural Networks. + + :param vocabulary_size: the size of the vocabulary + :param n_classes: number of target classes + :param embedding_size: the dimensionality of the word embeddings space (default 100) + :param hidden_size: the dimensionality of the hidden space (default 256) + :param repr_size: the dimensionality of the document embeddings space (default 100) + :param kernel_heights: list of kernel lengths (default [3,5,7]), i.e., the number of + consecutive tokens that each kernel covers + :param stride: convolutional stride (default 1) + :param stride: convolutional pad (default 0) + :param drop_p: drop probability for dropout (default 0.5) + """ + + def __init__(self, vocabulary_size, n_classes, embedding_size=100, hidden_size=256, repr_size=100, + kernel_heights=[3, 5, 7], stride=1, padding=0, drop_p=0.5): + super(CNNnet, self).__init__() + + self.vocabulary_size_ = vocabulary_size + self.n_classes = n_classes + self.hyperparams={ + 'embedding_size': embedding_size, + 'hidden_size': hidden_size, + 'repr_size': repr_size, + 'kernel_heights':kernel_heights, + 'stride': stride, + 'drop_p': drop_p + } + self.word_embedding = torch.nn.Embedding(vocabulary_size, embedding_size) + in_channels = 1 + self.conv1 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[0], embedding_size), stride, padding) + self.conv2 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[1], embedding_size), stride, padding) + self.conv3 = nn.Conv2d(in_channels, hidden_size, (kernel_heights[2], embedding_size), stride, padding) + self.dropout = nn.Dropout(drop_p) + + self.dim = repr_size + self.doc_embedder = torch.nn.Linear(len(kernel_heights) * hidden_size, self.dim) + self.output = nn.Linear(self.dim, n_classes) + + def __conv_block(self, input, conv_layer): + conv_out = conv_layer(input) # conv_out.size() = (batch_size, out_channels, dim, 1) + activation = F.relu(conv_out.squeeze(3)) # activation.size() = (batch_size, out_channels, dim1) + max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2) # maxpool_out.size() = (batch_size, out_channels) + return max_out + +
+[docs] + def document_embedding(self, input): + """Embeds documents (i.e., performs the forward pass up to the + next-to-last layer). + + :param input: a batch of instances, typically generated by a torch's `DataLoader` + instance (see :class:`quapy.classification.neural.TorchDataset`) + :return: a torch tensor of shape `(n_samples, n_dimensions)`, where + `n_samples` is the number of documents, and `n_dimensions` is the + dimensionality of the embedding + """ + input = self.word_embedding(input) + input = input.unsqueeze(1) # input.size() = (batch_size, 1, num_seq, embedding_length) + + max_out1 = self.__conv_block(input, self.conv1) + max_out2 = self.__conv_block(input, self.conv2) + max_out3 = self.__conv_block(input, self.conv3) + + all_out = torch.cat((max_out1, max_out2, max_out3), 1) # all_out.size() = (batch_size, num_kernels*out_channels) + abstracted = self.dropout(F.relu(all_out)) # (batch_size, num_kernels*out_channels) + abstracted = self.doc_embedder(abstracted) + return abstracted
+ + +
+[docs] + def get_params(self): + """ + Get hyper-parameters for this estimator + + :return: a dictionary with parameter names mapped to their values + """ + return self.hyperparams
+ + + @property + def vocabulary_size(self): + """ + Return the size of the vocabulary + + :return: integer + """ + return self.vocabulary_size_
+ + + + + + +
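A sketch of how the pieces above fit together: instances are lists of token indices, CNNnet (or LSTMnet) maps them to document embeddings, and NeuralClassifierTrainer handles batching, early stopping and checkpointing. The token data below is random and only illustrates the expected input format; the hyper-parameter values are arbitrary, `device='cpu'` overrides the default `'cuda'`, and the padding index is set explicitly since it is read from `qp.environ`.

```python
# Minimal sketch: training a CNN-based text classifier on synthetic token sequences.
# Data, hyper-parameters and paths are illustrative assumptions, not recommended settings.
import numpy as np
import quapy as qp
from quapy.classification.neural import CNNnet, NeuralClassifierTrainer

qp.environ['PAD_INDEX'] = 0   # padding index used by TorchDataset.asDataloader

vocab_size, n_classes = 1000, 2
instances = [np.random.randint(1, vocab_size, size=np.random.randint(20, 100)).tolist()
             for _ in range(200)]                     # variable-length token sequences
labels = np.random.randint(0, n_classes, size=200)

net = CNNnet(vocab_size, n_classes, embedding_size=50, hidden_size=64, repr_size=32)
trainer = NeuralClassifierTrainer(net, device='cpu', epochs=3, batch_size=32,
                                  checkpointpath='./checkpoint/classifier_net.dat')
trainer.fit(instances, labels, val_split=0.3)

posteriors = trainer.predict_proba(instances[:10])    # shape (10, n_classes)
embeddings = trainer.transform(instances[:10])        # shape (10, repr_size)
```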
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/classification/svmperf.html b/docs/build/html/_modules/quapy/classification/svmperf.html
new file mode 100644
index 0000000..959ad48
--- /dev/null
+++ b/docs/build/html/_modules/quapy/classification/svmperf.html
@@ -0,0 +1,268 @@
+quapy.classification.svmperf — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.classification.svmperf

+import random
+import shutil
+import subprocess
+import tempfile
+from os import remove, makedirs
+from os.path import join, exists
+from subprocess import PIPE, STDOUT
+import numpy as np
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.datasets import dump_svmlight_file
+
+
+
+[docs] +class SVMperf(BaseEstimator, ClassifierMixin): + """A wrapper for the `SVM-perf package <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`__ by Thorsten Joachims. + When using losses for quantification, the source code has to be patched. See + the `installation documentation <https://hlt-isti.github.io/QuaPy/build/html/Installation.html#svm-perf-with-quantification-oriented-losses>`__ + for further details. + + References: + + * `Esuli et al.2015 <https://dl.acm.org/doi/abs/10.1145/2700406?casa_token=8D2fHsGCVn0AAAAA:ZfThYOvrzWxMGfZYlQW_y8Cagg-o_l6X_PcF09mdETQ4Tu7jK98mxFbGSXp9ZSO14JkUIYuDGFG0>`__ + * `Barranquero et al.2015 <https://www.sciencedirect.com/science/article/abs/pii/S003132031400291X>`__ + + :param svmperf_base: path to directory containing the binary files `svm_perf_learn` and `svm_perf_classify` + :param C: trade-off between training error and margin (default 0.01) + :param verbose: set to True to print svm-perf std outputs + :param loss: the loss to optimize for. Available losses are "01", "f1", "kld", "nkld", "q", "qacc", "qf1", "qgm", "mae", "mrae". + :param host_folder: directory where to store the trained model; set to None (default) for using a tmp directory + (temporal directories are automatically deleted) + """ + + # losses with their respective codes in svm_perf implementation + valid_losses = {'01':0, 'f1':1, 'kld':12, 'nkld':13, 'q':22, 'qacc':23, 'qf1':24, 'qgm':25, 'mae':26, 'mrae':27} + + def __init__(self, svmperf_base, C=0.01, verbose=False, loss='01', host_folder=None): + assert exists(svmperf_base), f'path {svmperf_base} does not seem to point to a valid path' + self.svmperf_base = svmperf_base + self.C = C + self.verbose = verbose + self.loss = loss + self.host_folder = host_folder + + # def set_params(self, **parameters): + # """ + # Set the hyper-parameters for svm-perf. Currently, only the `C` and `loss` parameters are supported + # + # :param parameters: a `**kwargs` dictionary `{'C': <float>}` + # """ + # assert sorted(list(parameters.keys())) == ['C', 'loss'], \ + # 'currently, only the C and loss parameters are supported' + # self.C = parameters.get('C', self.C) + # self.loss = parameters.get('loss', self.loss) + # + # def get_params(self, deep=True): + # return {'C': self.C, 'loss': self.loss} + +
+[docs] + def fit(self, X, y): + """ + Trains the SVM for the multivariate performance loss + + :param X: training instances + :param y: a binary vector of labels + :return: `self` + """ + assert self.loss in SVMperf.valid_losses, \ + f'unsupported loss {self.loss}, valid ones are {list(SVMperf.valid_losses.keys())}' + + self.svmperf_learn = join(self.svmperf_base, 'svm_perf_learn') + self.svmperf_classify = join(self.svmperf_base, 'svm_perf_classify') + self.loss_cmd = '-w 3 -l ' + str(self.valid_losses[self.loss]) + self.c_cmd = '-c ' + str(self.C) + + self.classes_ = sorted(np.unique(y)) + self.n_classes_ = len(self.classes_) + + local_random = random.Random() + # this would allow to run parallel instances of predict + random_code = 'svmperfprocess'+'-'.join(str(local_random.randint(0, 1000000)) for _ in range(5)) + if self.host_folder is None: + # tmp dir are removed after the fit terminates in multiprocessing... + self.tmpdir = tempfile.TemporaryDirectory(suffix=random_code).name + else: + self.tmpdir = join(self.host_folder, '.' + random_code) + makedirs(self.tmpdir, exist_ok=True) + + self.model = join(self.tmpdir, 'model-'+random_code) + traindat = join(self.tmpdir, f'train-{random_code}.dat') + + dump_svmlight_file(X, y, traindat, zero_based=False) + + cmd = ' '.join([self.svmperf_learn, self.c_cmd, self.loss_cmd, traindat, self.model]) + if self.verbose: + print('[Running]', cmd) + p = subprocess.run(cmd.split(), stdout=PIPE, stderr=STDOUT) + if not exists(self.model): + print(p.stderr.decode('utf-8')) + remove(traindat) + + if self.verbose: + print(p.stdout.decode('utf-8')) + + return self
+ + +
+[docs] + def predict(self, X): + """ + Predicts labels for the instances `X` + + :param X: array-like of shape `(n_samples, n_features)` instances to classify + :return: a `numpy` array of length `n` containing the label predictions, where `n` is the number of + instances in `X` + """ + confidence_scores = self.decision_function(X) + predictions = (confidence_scores > 0) * 1 + return predictions
+ + +
+[docs] + def decision_function(self, X, y=None): + """ + Evaluate the decision function for the samples in `X`. + + :param X: array-like of shape `(n_samples, n_features)` containing the instances to classify + :param y: unused + :return: array-like of shape `(n_samples,)` containing the decision scores of the instances + """ + assert hasattr(self, 'tmpdir'), 'predict called before fit' + assert self.tmpdir is not None, 'model directory corrupted' + assert exists(self.model), 'model not found' + if y is None: + y = np.zeros(X.shape[0]) + + # in order to allow for parallel runs of predict, a random code is assigned + local_random = random.Random() + random_code = '-'.join(str(local_random.randint(0, 1000000)) for _ in range(5)) + predictions_path = join(self.tmpdir, 'predictions' + random_code + '.dat') + testdat = join(self.tmpdir, 'test' + random_code + '.dat') + dump_svmlight_file(X, y, testdat, zero_based=False) + + cmd = ' '.join([self.svmperf_classify, testdat, self.model, predictions_path]) + if self.verbose: + print('[Running]', cmd) + p = subprocess.run(cmd.split(), stdout=PIPE, stderr=STDOUT) + + if self.verbose: + print(p.stdout.decode('utf-8')) + + scores = np.loadtxt(predictions_path) + remove(testdat) + remove(predictions_path) + + return scores
+ + + def __del__(self): + if hasattr(self, 'tmpdir'): + shutil.rmtree(self.tmpdir, ignore_errors=True)
+ + +
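SVMperf shells out to the patched svm_perf_learn / svm_perf_classify binaries, so the only piece that cannot be made self-contained is the path to those binaries: `'./svm_perf_quantification'` below is a placeholder, and the binaries must be compiled and patched beforehand (see the SVMperf manual). The dataset is synthetic.

```python
# Minimal sketch of the SVMperf wrapper with a quantification-oriented loss.
# './svm_perf_quantification' is a placeholder for the directory holding the
# patched svm_perf_learn and svm_perf_classify binaries.
from sklearn.datasets import make_classification
from quapy.classification.svmperf import SVMperf

X, y = make_classification(n_samples=500, n_features=20, random_state=0)

svm = SVMperf(svmperf_base='./svm_perf_quantification', C=0.01, loss='kld', verbose=True)
svm.fit(X, y)                       # dumps a temporary svmlight file and calls svm_perf_learn
predictions = svm.predict(X[:10])   # thresholded decision scores (0/1)
scores = svm.decision_function(X[:10])
```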
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/data/_ifcb.html b/docs/build/html/_modules/quapy/data/_ifcb.html
new file mode 100644
index 0000000..942a5e6
--- /dev/null
+++ b/docs/build/html/_modules/quapy/data/_ifcb.html
@@ -0,0 +1,165 @@
+quapy.data._ifcb — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.data._ifcb

+import os
+import pandas as pd
+from quapy.protocol import AbstractProtocol
+
+
+[docs] +class IFCBTrainSamplesFromDir(AbstractProtocol): + + def __init__(self, path_dir:str, classes: list): + self.path_dir = path_dir + self.classes = classes + self.samples = [] + for filename in os.listdir(path_dir): + if filename.endswith('.csv'): + self.samples.append(filename) + + def __call__(self): + for sample in self.samples: + s = pd.read_csv(os.path.join(self.path_dir,sample)) + # all columns but the first where we get the class + X = s.iloc[:, 1:].to_numpy() + y = s.iloc[:, 0].to_numpy() + yield X, y + +
+[docs] + def total(self): + """ + Returns the total number of samples that the protocol generates. + + :return: The number of training samples to generate. + """ + return len(self.samples)
+
+ + + +
+[docs] +class IFCBTestSamples(AbstractProtocol): + + def __init__(self, path_dir:str, test_prevalences_path: str): + self.path_dir = path_dir + self.test_prevalences = pd.read_csv(os.path.join(path_dir, test_prevalences_path)) + + def __call__(self): + for _, test_sample in self.test_prevalences.iterrows(): + #Load the sample from disk + X = pd.read_csv(os.path.join(self.path_dir,test_sample['sample']+'.csv')).to_numpy() + prevalences = test_sample.iloc[1:].to_numpy().astype(float) + yield X, prevalences + +
+[docs] + def total(self): + """ + Returns the total number of samples that the protocol generates. + + :return: The number of test samples to generate. + """ + return len(self.test_prevalences.index)
+
+ +
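Both IFCB protocols iterate over per-sample csv files in a directory. The sketch below assumes a hypothetical layout ('ifcb/train' and the class names are placeholders): one csv per sample, with the class label in the first column and the features in the remaining columns, as expected by IFCBTrainSamplesFromDir.

```python
# Minimal sketch of the IFCB training protocol; paths and class names are placeholders.
from quapy.data._ifcb import IFCBTrainSamplesFromDir

classes = ['class_A', 'class_B', 'class_C']  # hypothetical class names
train_gen = IFCBTrainSamplesFromDir(path_dir='ifcb/train', classes=classes)

print(f'{train_gen.total()} training samples found')
for X, y in train_gen():          # each sample is an (instances, labels) pair
    print(X.shape, y[:3])
    break
```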
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/data/_lequa2022.html b/docs/build/html/_modules/quapy/data/_lequa2022.html
new file mode 100644
index 0000000..f2a8fab
--- /dev/null
+++ b/docs/build/html/_modules/quapy/data/_lequa2022.html
@@ -0,0 +1,307 @@
+quapy.data._lequa2022 — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.data._lequa2022

+from typing import Tuple, Union
+import pandas as pd
+import numpy as np
+import os
+
+from quapy.protocol import AbstractProtocol
+
+DEV_SAMPLES = 1000
+TEST_SAMPLES = 5000
+
+ERROR_TOL = 1E-3
+
+
+
+[docs] +def load_category_map(path): + cat2code = {} + with open(path, 'rt') as fin: + for line in fin: + category, code = line.split() + cat2code[category] = int(code) + code2cat = [cat for cat, code in sorted(cat2code.items(), key=lambda x: x[1])] + return cat2code, code2cat
+ + + +
+[docs] +def load_raw_documents(path): + df = pd.read_csv(path) + documents = list(df["text"].values) + labels = None + if "label" in df.columns: + labels = df["label"].values.astype(int) + return documents, labels
+ + + +
+[docs] +def load_vector_documents(path): + D = pd.read_csv(path).to_numpy(dtype=float) + labelled = D.shape[1] == 301 + if labelled: + X, y = D[:, 1:], D[:, 0].astype(int).flatten() + else: + X, y = D, None + return X, y
+ + + +
+[docs] +class SamplesFromDir(AbstractProtocol): + + def __init__(self, path_dir:str, ground_truth_path:str, load_fn): + self.path_dir = path_dir + self.load_fn = load_fn + self.true_prevs = ResultSubmission.load(ground_truth_path) + + def __call__(self): + for id, prevalence in self.true_prevs.iterrows(): + sample, _ = self.load_fn(os.path.join(self.path_dir, f'{id}.txt')) + yield sample, prevalence
+ + + +
+[docs] +class ResultSubmission: + + def __init__(self): + self.df = None + + def __init_df(self, categories: int): + if not isinstance(categories, int) or categories < 2: + raise TypeError('wrong format for categories: an int (>=2) was expected') + df = pd.DataFrame(columns=list(range(categories))) + df.index.set_names('id', inplace=True) + self.df = df + + @property + def n_categories(self): + return len(self.df.columns.values) + +
+[docs] + def add(self, sample_id: int, prevalence_values: np.ndarray): + if not isinstance(sample_id, int): + raise TypeError(f'error: expected int for sample_sample, found {type(sample_id)}') + if not isinstance(prevalence_values, np.ndarray): + raise TypeError(f'error: expected np.ndarray for prevalence_values, found {type(prevalence_values)}') + if self.df is None: + self.__init_df(categories=len(prevalence_values)) + if sample_id in self.df.index.values: + raise ValueError(f'error: prevalence values for "{sample_id}" already added') + if prevalence_values.ndim != 1 and prevalence_values.size != self.n_categories: + raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}') + if (prevalence_values < 0).any() or (prevalence_values > 1).any(): + raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_id}"') + if np.abs(prevalence_values.sum() - 1) > ERROR_TOL: + raise ValueError(f'error: prevalence values do not sum up to one for "{sample_id}"' + f'(error tolerance {ERROR_TOL})') + + self.df.loc[sample_id] = prevalence_values
+ + + def __len__(self): + return len(self.df) + +
+[docs] + @classmethod + def load(cls, path: str) -> 'ResultSubmission': + df = ResultSubmission.check_file_format(path) + r = ResultSubmission() + r.df = df + return r
+ + +
+[docs] + def dump(self, path: str): + ResultSubmission.check_dataframe_format(self.df) + self.df.to_csv(path)
+ + +
+[docs] + def prevalence(self, sample_id: int): + sel = self.df.loc[sample_id] + if sel.empty: + return None + else: + return sel.values.flatten()
+ + +
+[docs] + def iterrows(self): + for index, row in self.df.iterrows(): + prevalence = row.values.flatten() + yield index, prevalence
+ + +
+[docs]
+    @classmethod
+    def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
+        try:
+            df = pd.read_csv(path, index_col=0)
+        except Exception as e:
+            # without a parsable csv there is nothing left to check
+            raise ValueError(f'the file {path} does not seem to be a valid csv file') from e
+        return ResultSubmission.check_dataframe_format(df, path=path)
+ + +
+[docs]
+    @classmethod
+    def check_dataframe_format(cls, df, path=None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
+        hint_path = ''  # if given, show the data path in the error message
+        if path is not None:
+            hint_path = f' in {path}'
+
+        if df.index.name != 'id' or len(df.columns) < 2:
+            raise ValueError(f'wrong header{hint_path}, '
+                             f'the format of the header should be "id,0,...,n-1", '
+                             f'where n is the number of categories')
+        if [int(ci) for ci in df.columns.values] != list(range(len(df.columns))):
+            raise ValueError(f'wrong header{hint_path}, category ids should be 0,1,2,...,n-1, '
+                             f'where n is the number of categories')
+        if df.empty:
+            raise ValueError(f'error{hint_path}: results file is empty')
+        elif len(df) != DEV_SAMPLES and len(df) != TEST_SAMPLES:
+            raise ValueError(f'wrong number of prevalence values found{hint_path}; '
+                             f'expected {DEV_SAMPLES} for development sets and '
+                             f'{TEST_SAMPLES} for test sets; found {len(df)}')
+
+        ids = set(df.index.values)
+        expected_ids = set(range(len(df)))
+        if ids != expected_ids:
+            missing = expected_ids - ids
+            if missing:
+                raise ValueError(f'there are {len(missing)} missing ids{hint_path}: {sorted(missing)}')
+            unexpected = ids - expected_ids
+            if unexpected:
+                raise ValueError(f'there are {len(unexpected)} unexpected ids{hint_path}: {sorted(unexpected)}')
+
+        for category_id in df.columns:
+            if (df[category_id] < 0).any() or (df[category_id] > 1).any():
+                raise ValueError(f'error{hint_path} column "{category_id}" contains values out of range [0,1]')
+
+        prevs = df.values
+        round_errors = np.abs(prevs.sum(axis=-1) - 1.) > ERROR_TOL
+        if round_errors.any():
+            raise ValueError(f'error: prevalence values in rows with id {np.where(round_errors)[0].tolist()} '
+                             f'do not sum up to 1 (error tolerance {ERROR_TOL}), '
+                             f'probably due to some rounding errors.')
+
+        return df
+
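A sketch of how `ResultSubmission` might be used to assemble a submission. Note that `dump` runs the same validation as `load`, which (among other checks) requires exactly `DEV_SAMPLES` or `TEST_SAMPLES` rows, so one estimate per sample must have been added; the file name and the constant prevalence estimate below are illustrative:

```python
import numpy as np

submission = ResultSubmission()
for sample_id in range(TEST_SAMPLES):
    estim_prev = np.asarray([0.25, 0.75])   # placeholder for a real estimate
    submission.add(sample_id, estim_prev)
submission.dump('T1A_submission.csv')        # validated on write

reloaded = ResultSubmission.load('T1A_submission.csv')  # validated on read
print(reloaded.prevalence(0))                # -> array([0.25, 0.75])
```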
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/data/base.html b/docs/build/html/_modules/quapy/data/base.html
new file mode 100644
index 0000000..e3a2e89
--- /dev/null
+++ b/docs/build/html/_modules/quapy/data/base.html
@@ -0,0 +1,728 @@
+quapy.data.base — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation
Source code for quapy.data.base

+import itertools
+from functools import cached_property
+from typing import Iterable
+
+import numpy as np
+from scipy.sparse import issparse
+from scipy.sparse import vstack
+from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
+from numpy.random import RandomState
+from quapy.functional import strprev
+from quapy.util import temp_seed
+
+
+
+[docs] +class LabelledCollection: + """ + A LabelledCollection is a set of objects each with a label attached to each of them. + This class implements several sampling routines and other utilities. + + :param instances: array-like (np.ndarray, list, or csr_matrix are supported) + :param labels: array-like with the same length of instances + :param classes: optional, list of classes from which labels are taken. If not specified, the classes are inferred + from the labels. The classes must be indicated in cases in which some of the labels might have no examples + (i.e., a prevalence of 0) + """ + + def __init__(self, instances, labels, classes=None): + if issparse(instances): + self.instances = instances + elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str): + # lists of strings occupy too much as ndarrays (although python-objects add a heavy overload) + self.instances = np.asarray(instances, dtype=object) + else: + self.instances = np.asarray(instances) + self.labels = np.asarray(labels) + n_docs = len(self) + if classes is None: + self.classes_ = np.unique(self.labels) + self.classes_.sort() + else: + self.classes_ = np.unique(np.asarray(classes)) + self.classes_.sort() + if len(set(self.labels).difference(set(classes))) > 0: + raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes)})') + self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_} + +
+[docs] + @classmethod + def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs): + """ + Loads a labelled set of data and convert it into a :class:`LabelledCollection` instance. The function in charge + of reading the instances must be specified. This function can be a custom one, or any of the reading functions + defined in :mod:`quapy.data.reader` module. + + :param path: string, the path to the file containing the labelled instances + :param loader_func: a custom function that implements the data loader and returns a tuple with instances and + labels + :param classes: array-like, the classes according to which the instances are labelled + :param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances, i.e., + these arguments are used to call `loader_func(path, **loader_kwargs)` + :return: a :class:`LabelledCollection` object + """ + return LabelledCollection(*loader_func(path, **loader_kwargs), classes)
+ + + def __len__(self): + """ + Returns the length of this collection (number of labelled instances) + + :return: integer + """ + return self.instances.shape[0] + +
+[docs] + def prevalence(self): + """ + Returns the prevalence, or relative frequency, of the classes in the codeframe. + + :return: a np.ndarray of shape `(n_classes)` with the relative frequencies of each class, in the same order + as listed by `self.classes_` + """ + return self.counts() / len(self)
+ + +
+[docs] + def counts(self): + """ + Returns the number of instances for each of the classes in the codeframe. + + :return: a np.ndarray of shape `(n_classes)` with the number of instances of each class, in the same order + as listed by `self.classes_` + """ + return np.asarray([len(self.index[class_]) for class_ in self.classes_])
+ + + @property + def n_classes(self): + """ + The number of classes + + :return: integer + """ + return len(self.classes_) + + @property + def binary(self): + """ + Returns True if the number of classes is 2 + + :return: boolean + """ + return self.n_classes == 2 + +
+[docs] + def sampling_index(self, size, *prevs, shuffle=True, random_state=None): + """ + Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the + prevalence values are not specified, then returns the index of a uniform sampling. + For each class, the sampling is drawn with replacement if the requested prevalence is larger than + the actual prevalence of the class, or without replacement otherwise. + + :param size: integer, the requested size + :param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since + it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in + `self.classes_` can be specified, while the other class takes prevalence value `1-p` + :param shuffle: if set to True (default), shuffles the index before returning it + :param random_state: seed for reproducing sampling + :return: a np.ndarray of shape `(size)` with the indexes + """ + if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling + return self.uniform_sampling_index(size, random_state=random_state) + if len(prevs) == self.n_classes - 1: + prevs = prevs + (1 - sum(prevs),) + assert len(prevs) == self.n_classes, 'unexpected number of prevalences' + assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})' + + # Decide how many instances should be taken for each class in order to satisfy the requested prevalence + # accurately, and the number of instances in the sample (exactly). If int(size * prevs[i]) (which is + # <= size * prevs[i]) examples are drawn from class i, there could be a remainder number of instances to take + # to satisfy the size constrain. The remainder is distributed along the classes with probability = prevs. + # (This aims at avoiding the remainder to be placed in a class for which the prevalence requested is 0.) + n_requests = {class_: round(size * prevs[i]) for i, class_ in enumerate(self.classes_)} + remainder = size - sum(n_requests.values()) + with temp_seed(random_state): + # due to rounding, the remainder can be 0, >0, or <0 + if remainder > 0: + # when the remainder is >0 we randomly add 1 to the requests for each class; + # more prevalent classes are more likely to be taken in order to minimize the impact in the final prevalence + for rand_class in np.random.choice(self.classes_, size=remainder, p=prevs): + n_requests[rand_class] += 1 + elif remainder < 0: + # when the remainder is <0 we randomly remove 1 from the requests, unless the request is 0 for a chosen + # class; we repeat until remainder==0 + while remainder!=0: + rand_class = np.random.choice(self.classes_, p=prevs) + if n_requests[rand_class] > 0: + n_requests[rand_class] -= 1 + remainder += 1 + + indexes_sample = [] + for class_, n_requested in n_requests.items(): + n_candidates = len(self.index[class_]) + index_sample = self.index[class_][ + np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates)) + ] if n_requested > 0 else [] + + indexes_sample.append(index_sample) + + indexes_sample = np.concatenate(indexes_sample).astype(int) + + if shuffle: + indexes_sample = np.random.permutation(indexes_sample) + + return indexes_sample
+ + +
+[docs] + def uniform_sampling_index(self, size, random_state=None): + """ + Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn + with replacement if the requested size is greater than the number of instances, or without replacement + otherwise. + + :param size: integer, the size of the uniform sample + :param random_state: if specified, guarantees reproducibility of the split. + :return: a np.ndarray of shape `(size)` with the indexes + """ + if random_state is not None: + ng = RandomState(seed=random_state) + else: + ng = np.random + return ng.choice(len(self), size, replace=size > len(self))
+ + +
+[docs]
+    def sampling(self, size, *prevs, shuffle=True, random_state=None):
+        """
+        Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence
+        values. For each class, the sampling is drawn with replacement if the requested prevalence is larger than
+        the actual prevalence of the class, or without replacement otherwise.
+
+        :param size: integer, the requested size
+        :param prevs: the prevalence for each class; the prevalence value for the last class can be left empty since
+            it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
+            `self.classes_`) can be specified, while the other class takes prevalence value `1-p`
+        :param shuffle: if set to True (default), shuffles the index before returning it
+        :param random_state: seed for reproducing sampling
+        :return: an instance of :class:`LabelledCollection` with length == `size` and prevalence close to `prevs` (or
+            prevalence == `prevs` if the exact prevalence values can be met as proportions of instances)
+        """
+        prev_index = self.sampling_index(size, *prevs, shuffle=shuffle, random_state=random_state)
+        return self.sampling_from_index(prev_index)
+ + +
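A sketch of prevalence-controlled sampling on a binary collection `lc` (assumed to be any `LabelledCollection`, e.g., one obtained from a dataset loader); only the prevalence of the first class needs to be given, the last one being implicit:

```python
# index-based, two-step variant
idx = lc.sampling_index(500, 0.3, random_state=0)
sample = lc.sampling_from_index(idx)

# or in one step
sample = lc.sampling(500, 0.3, random_state=0)
print(sample.prevalence())   # close to [0.3, 0.7]
```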
+[docs] + def uniform_sampling(self, size, random_state=None): + """ + Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn + with replacement if the requested size is greater than the number of instances, or without replacement + otherwise. + + :param size: integer, the requested size + :param random_state: if specified, guarantees reproducibility of the split. + :return: an instance of :class:`LabelledCollection` with length == `size` + """ + unif_index = self.uniform_sampling_index(size, random_state=random_state) + return self.sampling_from_index(unif_index)
+ + +
+[docs] + def sampling_from_index(self, index): + """ + Returns an instance of :class:`LabelledCollection` whose elements are sampled from this collection using the + index. + + :param index: np.ndarray + :return: an instance of :class:`LabelledCollection` + """ + documents = self.instances[index] + labels = self.labels[index] + return LabelledCollection(documents, labels, classes=self.classes_)
+ + +
+[docs] + def split_stratified(self, train_prop=0.6, random_state=None): + """ + Returns two instances of :class:`LabelledCollection` split with stratification from this collection, at desired + proportion. + + :param train_prop: the proportion of elements to include in the left-most returned collection (typically used + as the training collection). The rest of elements are included in the right-most returned collection + (typically used as a test collection). + :param random_state: if specified, guarantees reproducibility of the split. + :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the + second one with `1-train_prop` elements + """ + tr_docs, te_docs, tr_labels, te_labels = train_test_split( + self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state + ) + training = LabelledCollection(tr_docs, tr_labels, classes=self.classes_) + test = LabelledCollection(te_docs, te_labels, classes=self.classes_) + return training, test
+ + +
+[docs] + def split_random(self, train_prop=0.6, random_state=None): + """ + Returns two instances of :class:`LabelledCollection` split randomly from this collection, at desired + proportion. + + :param train_prop: the proportion of elements to include in the left-most returned collection (typically used + as the training collection). The rest of elements are included in the right-most returned collection + (typically used as a test collection). + :param random_state: if specified, guarantees reproducibility of the split. + :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the + second one with `1-train_prop` elements + """ + indexes = np.random.RandomState(seed=random_state).permutation(len(self)) + if isinstance(train_prop, int): + assert train_prop < len(self), \ + 'argument train_prop cannot be greater than the number of elements in the collection' + splitpoint = train_prop + elif isinstance(train_prop, float): + assert 0 < train_prop < 1, \ + 'argument train_prop out of range (0,1)' + splitpoint = int(np.round(len(self)*train_prop)) + left, right = indexes[:splitpoint], indexes[splitpoint:] + training = self.sampling_from_index(left) + test = self.sampling_from_index(right) + return training, test
+ + + def __add__(self, other): + """ + Returns a new :class:`LabelledCollection` as the union of this collection with another collection. + Both labelled collections must have the same classes. + + :param other: another :class:`LabelledCollection` + :return: a :class:`LabelledCollection` representing the union of both collections + """ + if not all(np.sort(self.classes_)==np.sort(other.classes_)): + raise NotImplementedError(f'unsupported operation for collections on different classes; ' + f'expected {self.classes_}, found {other.classes_}') + return LabelledCollection.join(self, other) + +
+[docs]
+    @classmethod
+    def join(cls, *args: Iterable['LabelledCollection']):
+        """
+        Returns a new :class:`LabelledCollection` as the union of the collections given in input.
+
+        :param args: instances of :class:`LabelledCollection`
+        :return: a :class:`LabelledCollection` representing the union of all the given collections
+        """
+
+        args = [lc for lc in args if lc is not None]
+        assert len(args) > 0, 'empty list is not allowed for join'
+
+        assert all([isinstance(lc, LabelledCollection) for lc in args]), \
+            'only instances of LabelledCollection allowed'
+
+        first_instances = args[0].instances
+        first_type = type(first_instances)
+        assert all([type(lc.instances) == first_type for lc in args[1:]]), \
+            'not all the collections are of instances of the same type'
+
+        if issparse(first_instances) or isinstance(first_instances, np.ndarray):
+            first_ndim = first_instances.ndim
+            assert all([lc.instances.ndim == first_ndim for lc in args[1:]]), \
+                'not all the ndarrays are of the same dimension'
+            if first_ndim > 1:
+                first_shape = first_instances.shape[1:]
+                assert all([lc.instances.shape[1:] == first_shape for lc in args[1:]]), \
+                    'not all the ndarrays are of the same shape'
+            if issparse(first_instances):
+                instances = vstack([lc.instances for lc in args])
+            else:
+                instances = np.concatenate([lc.instances for lc in args])
+        elif isinstance(first_instances, list):
+            # flatten the per-collection lists of instances into a single list
+            instances = list(itertools.chain.from_iterable(lc.instances for lc in args))
+        else:
+            raise NotImplementedError('unsupported operation for collection types')
+        labels = np.concatenate([lc.labels for lc in args])
+        classes = np.unique(labels)  # np.unique already returns the classes sorted
+        return LabelledCollection(instances, labels, classes=classes)
+ + + @property + def Xy(self): + """ + Gets the instances and labels. This is useful when working with `sklearn` estimators, e.g.: + + >>> svm = LinearSVC().fit(*my_collection.Xy) + + :return: a tuple `(instances, labels)` from this collection + """ + return self.instances, self.labels + + @property + def Xp(self): + """ + Gets the instances and the true prevalence. This is useful when implementing evaluation protocols from + a :class:`LabelledCollection` object. + + :return: a tuple `(instances, prevalence)` from this collection + """ + return self.instances, self.prevalence() + + @property + def X(self): + """ + An alias to self.instances + + :return: self.instances + """ + return self.instances + + @property + def y(self): + """ + An alias to self.labels + + :return: self.labels + """ + return self.labels + + @property + def p(self): + """ + An alias to self.prevalence() + + :return: self.prevalence() + """ + return self.prevalence() + + +
+[docs] + def stats(self, show=True): + """ + Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,: + + >>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5) + >>> data.training.stats() + >>> #instances=3821, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], prevs=[0.081, 0.919] + + :param show: if set to True (default), prints the stats in standard output + :return: a dictionary containing some stats of this collection. Keys include `#instances` (the number of + instances), `type` (the type representing the instances), `#features` (the number of features, if the + instances are in array-like format), `#classes` (the classes of the collection), `prevs` (the prevalence + values for each class) + """ + ninstances = len(self) + instance_type = type(self.instances[0]) + if instance_type == list: + nfeats = len(self.instances[0]) + elif instance_type == np.ndarray or issparse(self.instances): + nfeats = self.instances.shape[1] + else: + nfeats = '?' + stats_ = {'instances': ninstances, + 'type': instance_type, + 'features': nfeats, + 'classes': self.classes_, + 'prevs': strprev(self.prevalence())} + if show: + print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, ' + f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}') + return stats_
+ + +
+[docs] + def kFCV(self, nfolds=5, nrepeats=1, random_state=None): + """ + Generator of stratified folds to be used in k-fold cross validation. + + :param nfolds: integer (default 5), the number of folds to generate + :param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run + :param random_state: integer (default 0), guarantees that the folds generated are reproducible + :return: yields `nfolds * nrepeats` folds for k-fold cross validation + """ + kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state) + for train_index, test_index in kf.split(*self.Xy): + train = self.sampling_from_index(train_index) + test = self.sampling_from_index(test_index) + yield train, test
+
+ + + +
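A sketch of the cross-validation generator on a labelled collection `lc` (two rounds of 5 folds, i.e., the 5FCVx2 protocol mentioned later in the UCI loaders):

```python
for i, (train, test) in enumerate(lc.kFCV(nfolds=5, nrepeats=2, random_state=0)):
    print(f'fold {i}: #train={len(train)}, #test={len(test)}, train prevs={train.prevalence()}')
```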
+[docs] +class Dataset: + """ + Abstraction of training and test :class:`LabelledCollection` objects. + + :param training: a :class:`LabelledCollection` instance + :param test: a :class:`LabelledCollection` instance + :param vocabulary: if indicated, is a dictionary of the terms used in this textual dataset + :param name: a string representing the name of the dataset + """ + + def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''): + assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections' + self.training = training + self.test = test + self.vocabulary = vocabulary + self.name = name + +
+[docs] + @classmethod + def SplitStratified(cls, collection: LabelledCollection, train_size=0.6): + """ + Generates a :class:`Dataset` from a stratified split of a :class:`LabelledCollection` instance. + See :meth:`LabelledCollection.split_stratified` + + :param collection: :class:`LabelledCollection` + :param train_size: the proportion of training documents (the rest conforms the test split) + :return: an instance of :class:`Dataset` + """ + return Dataset(*collection.split_stratified(train_prop=train_size))
+ + + @property + def classes_(self): + """ + The classes according to which the training collection is labelled + + :return: The classes according to which the training collection is labelled + """ + return self.training.classes_ + + @property + def n_classes(self): + """ + The number of classes according to which the training collection is labelled + + :return: integer + """ + return self.training.n_classes + + @property + def binary(self): + """ + Returns True if the training collection is labelled according to two classes + + :return: boolean + """ + return self.training.binary + +
+[docs] + @classmethod + def load(cls, train_path, test_path, loader_func: callable, classes=None, **loader_kwargs): + """ + Loads a training and a test labelled set of data and convert it into a :class:`Dataset` instance. + The function in charge of reading the instances must be specified. This function can be a custom one, or any of + the reading functions defined in :mod:`quapy.data.reader` module. + + :param train_path: string, the path to the file containing the training instances + :param test_path: string, the path to the file containing the test instances + :param loader_func: a custom function that implements the data loader and returns a tuple with instances and + labels + :param classes: array-like, the classes according to which the instances are labelled + :param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances. + See :meth:`LabelledCollection.load` for further details. + :return: a :class:`Dataset` object + """ + + training = LabelledCollection.load(train_path, loader_func, classes, **loader_kwargs) + test = LabelledCollection.load(test_path, loader_func, classes, **loader_kwargs) + return Dataset(training, test)
+ + + @property + def vocabulary_size(self): + """ + If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary + + :return: integer + """ + return len(self.vocabulary) + + @property + def train_test(self): + """ + Alias to `self.training` and `self.test` + + :return: the training and test collections + :return: the training and test collections + """ + return self.training, self.test + +
+[docs] + def stats(self, show=True): + """ + Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,: + + >>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5) + >>> data.stats() + >>> Dataset=kindle #tr-instances=3821, #te-instances=21591, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937] + + :param show: if set to True (default), prints the stats in standard output + :return: a dictionary containing some stats of this collection for the training and test collections. The keys + are `train` and `test`, and point to dedicated dictionaries of stats, for each collection, with keys + `#instances` (the number of instances), `type` (the type representing the instances), + `#features` (the number of features, if the instances are in array-like format), `#classes` (the classes of + the collection), `prevs` (the prevalence values for each class) + """ + tr_stats = self.training.stats(show=False) + te_stats = self.test.stats(show=False) + if show: + print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, ' + f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, ' + f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}') + return {'train': tr_stats, 'test': te_stats}
+ + +
+[docs] + @classmethod + def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0): + """ + Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around + :meth:`LabelledCollection.kFCV` that returns :class:`Dataset` instances made of training and test folds. + + :param nfolds: integer (default 5), the number of folds to generate + :param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run + :param random_state: integer (default 0), guarantees that the folds generated are reproducible + :return: yields `nfolds * nrepeats` folds for k-fold cross validation as instances of :class:`Dataset` + """ + for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)): + yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')
+ + + +
+[docs] + def reduce(self, n_train=100, n_test=100): + """ + Reduce the number of instances in place for quick experiments. Preserves the prevalence of each set. + + :param n_train: number of training documents to keep (default 100) + :param n_test: number of test documents to keep (default 100) + :return: self + """ + self.training = self.training.sampling(n_train, *self.training.prevalence()) + self.test = self.test.sampling(n_test, *self.test.prevalence()) + return self
+
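A sketch that wraps two `LabelledCollection` objects sharing the same classes into a `Dataset` and shrinks it for a quick experiment; `train_lc` and `test_lc` are assumed to exist:

```python
data = Dataset(train_lc, test_lc, name='toy')
data.reduce(n_train=100, n_test=100)   # preserves the prevalence of each split
data.stats()
```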
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/data/datasets.html b/docs/build/html/_modules/quapy/data/datasets.html
new file mode 100644
index 0000000..b910036
--- /dev/null
+++ b/docs/build/html/_modules/quapy/data/datasets.html
@@ -0,0 +1,919 @@
+quapy.data.datasets — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation
Source code for quapy.data.datasets

+
[docs]def warn(*args, **kwargs): + pass
+import warnings +warnings.warn = warn +import os +import zipfile +from os.path import join +import pandas as pd +from ucimlrepo import fetch_ucirepo +from quapy.data.base import Dataset, LabelledCollection +from quapy.data.preprocessing import text2tfidf, reduce_columns +from quapy.data.reader import * +from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource + + +REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb'] +TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders', + 'semeval13', 'semeval14', 'semeval15', 'semeval16', + 'sst', 'wa', 'wb'] +TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders', + 'semeval', 'semeval16', + 'sst', 'wa', 'wb'] +UCI_BINARY_DATASETS = ['acute.a', 'acute.b', + 'balance.1', 'balance.2', 'balance.3', + 'breast-cancer', + 'cmc.1', 'cmc.2', 'cmc.3', + 'ctg.1', 'ctg.2', 'ctg.3', + #'diabetes', # <-- I haven't found this one... + 'german', + 'haberman', + 'ionosphere', + 'iris.1', 'iris.2', 'iris.3', + 'mammographic', + 'pageblocks.5', + #'phoneme', # <-- I haven't found this one... + 'semeion', + 'sonar', + 'spambase', + 'spectf', + 'tictactoe', + 'transfusion', + 'wdbc', + 'wine.1', 'wine.2', 'wine.3', + 'wine-q-red', 'wine-q-white', + 'yeast'] + +UCI_MULTICLASS_DATASETS = ['dry-bean', + 'wine-quality', + 'academic-success', + 'digits', + 'letter'] + +LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B'] + +_TXA_SAMPLE_SIZE = 250 +_TXB_SAMPLE_SIZE = 1000 + +LEQUA2022_SAMPLE_SIZE = { + 'TXA': _TXA_SAMPLE_SIZE, + 'TXB': _TXB_SAMPLE_SIZE, + 'T1A': _TXA_SAMPLE_SIZE, + 'T1B': _TXB_SAMPLE_SIZE, + 'T2A': _TXA_SAMPLE_SIZE, + 'T2B': _TXB_SAMPLE_SIZE, + 'binary': _TXA_SAMPLE_SIZE, + 'multiclass': _TXB_SAMPLE_SIZE +} + + +
[docs]def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset: + """ + Loads a Reviews dataset as a Dataset instance, as used in + `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification." + Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_. + The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS` + + :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb' + :param tfidf: set to True to transform the raw documents into tfidf weighted matrices + :param min_df: minimun number of documents that should contain a term in order for the term to be + kept (ignored if tfidf==False) + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quay_data/ directory) + :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for + faster subsequent invokations + :return: a :class:`quapy.data.base.Dataset` instance + """ + assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \ + f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \ + f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}' + if data_home is None: + data_home = get_quapy_home() + + URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt' + URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt' + os.makedirs(join(data_home, 'reviews'), exist_ok=True) + train_path = join(data_home, 'reviews', dataset_name, 'train.txt') + test_path = join(data_home, 'reviews', dataset_name, 'test.txt') + download_file_if_not_exists(URL_TRAIN, train_path) + download_file_if_not_exists(URL_TEST, test_path) + + pickle_path = None + if pickle: + pickle_path = join(data_home, 'reviews', 'pickle', f'{dataset_name}.pkl') + data = pickled_resource(pickle_path, Dataset.load, train_path, test_path, from_text) + + if tfidf: + text2tfidf(data, inplace=True) + if min_df is not None: + reduce_columns(data, min_df=min_df, inplace=True) + + data.name = dataset_name + + return data
+ + +
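A sketch of loading one of the reviews datasets with tfidf features; `pickle=True` caches the processed `Dataset` for faster subsequent calls:

```python
import quapy as qp

data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5, pickle=True)
train, test = data.train_test
data.stats()
```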
[docs]def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset: + """ + Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in: + `Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis. + Social Network Analysis and Mining6(19), 1–22 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_ + Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set. + The list of valid dataset names corresponding to training sets can be accessed in + `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in + `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST` + + :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13', + 'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb' + :param for_model_selection: if True, then returns the train split as the training set and the devel split + as the test set; if False, then returns the train+devel split as the training set and the test set as the + test set + :param min_df: minimun number of documents that should contain a term in order for the term to be kept + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quay_data/ directory) + :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for + faster subsequent invokations + :return: a :class:`quapy.data.base.Dataset` instance + """ + assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \ + f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \ + f'Valid ones are {TWITTER_SENTIMENT_DATASETS_TRAIN} for model selection and ' \ + f'{TWITTER_SENTIMENT_DATASETS_TEST} for test (datasets "semeval14", "semeval15", "semeval16" share ' \ + f'a common training set "semeval")' + if data_home is None: + data_home = get_quapy_home() + + URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip' + unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam') + if not os.path.exists(unzipped_path): + downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip') + download_file(URL, downloaded_path) + with zipfile.ZipFile(downloaded_path) as file: + file.extractall(data_home) + os.remove(downloaded_path) + + if dataset_name in {'semeval13', 'semeval14', 'semeval15'}: + trainset_name = 'semeval' + testset_name = 'semeval' if for_model_selection else dataset_name + print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common " + f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}") + else: + if dataset_name == 'semeval' and for_model_selection==False: + raise ValueError('dataset "semeval" can only be used for model selection. 
' + 'Use "semeval13", "semeval14", or "semeval15" for model evaluation.') + trainset_name = testset_name = dataset_name + + if for_model_selection: + train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt') + test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt') + else: + train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt') + if dataset_name == 'semeval16': # there is a different test name in the case of semeval16 only + test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt') + else: + test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt') + + pickle_path = None + if pickle: + mode = "train-dev" if for_model_selection else "train+dev-test" + pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl') + data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse) + + if min_df is not None: + reduce_columns(data, min_df=min_df, inplace=True) + + data.name = dataset_name + + return data
+ + +
[docs]def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset: + """ + Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in + `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). + Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. + Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_ + and + `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019). + Dynamic ensemble selection for quantification tasks. + Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. + The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further + information on how to use these collections), and so a train-test split is generated at desired proportion. + The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS` + + :param dataset_name: a dataset name + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quay_data/ directory) + :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets + :return: a :class:`quapy.data.base.Dataset` instance + """ + data = fetch_UCIBinaryLabelledCollection(dataset_name, data_home, verbose) + return Dataset(*data.split_stratified(1 - test_split, random_state=0))
+ + +
[docs]def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: + """ + Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in + `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). + Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. + Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_ + and + `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019). + Dynamic ensemble selection for quantification tasks. + Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. + The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation + protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation. + This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.: + + >>> import quapy as qp + >>> collection = qp.datasets.fetch_UCIBinaryLabelledCollection("yeast") + >>> for data in qp.train.Dataset.kFCV(collection, nfolds=5, nrepeats=2): + >>> ... + + The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS` + + :param dataset_name: a dataset name + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quay_data/ directory) + :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets + :return: a :class:`quapy.data.base.LabelledCollection` instance + """ + + assert dataset_name in UCI_BINARY_DATASETS, \ + f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \ + f'Valid ones are {UCI_BINARY_DATASETS}' + if data_home is None: + data_home = get_quapy_home() + + dataset_fullname = { + 'acute.a': 'Acute Inflammations (urinary bladder)', + 'acute.b': 'Acute Inflammations (renal pelvis)', + 'balance.1': 'Balance Scale Weight & Distance Database (left)', + 'balance.2': 'Balance Scale Weight & Distance Database (balanced)', + 'balance.3': 'Balance Scale Weight & Distance Database (right)', + 'breast-cancer': 'Breast Cancer Wisconsin (Original)', + 'cmc.1': 'Contraceptive Method Choice (no use)', + 'cmc.2': 'Contraceptive Method Choice (long term)', + 'cmc.3': 'Contraceptive Method Choice (short term)', + 'ctg.1': 'Cardiotocography Data Set (normal)', + 'ctg.2': 'Cardiotocography Data Set (suspect)', + 'ctg.3': 'Cardiotocography Data Set (pathologic)', + 'german': 'Statlog German Credit Data', + 'haberman': "Haberman's Survival Data", + 'ionosphere': 'Johns Hopkins University Ionosphere DB', + 'iris.1': 'Iris Plants Database(x)', + 'iris.2': 'Iris Plants Database(versicolour)', + 'iris.3': 'Iris Plants Database(virginica)', + 'mammographic': 'Mammographic Mass', + 'pageblocks.5': 'Page Blocks Classification (5)', + 'semeion': 'Semeion Handwritten Digit (8)', + 'sonar': 'Sonar, Mines vs. 
Rocks', + 'spambase': 'Spambase Data Set', + 'spectf': 'SPECTF Heart Data', + 'tictactoe': 'Tic-Tac-Toe Endgame Database', + 'transfusion': 'Blood Transfusion Service Center Data Set', + 'wdbc': 'Wisconsin Diagnostic Breast Cancer', + 'wine.1': 'Wine Recognition Data (1)', + 'wine.2': 'Wine Recognition Data (2)', + 'wine.3': 'Wine Recognition Data (3)', + 'wine-q-red': 'Wine Quality Red (6-10)', + 'wine-q-white': 'Wine Quality White (6-10)', + 'yeast': 'Yeast', + } + + # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use + # to download the raw dataset + identifier_map = { + 'acute.a': 'acute', + 'acute.b': 'acute', + 'balance.1': 'balance-scale', + 'balance.2': 'balance-scale', + 'balance.3': 'balance-scale', + 'breast-cancer': 'breast-cancer-wisconsin', + 'cmc.1': 'cmc', + 'cmc.2': 'cmc', + 'cmc.3': 'cmc', + 'ctg.1': '00193', + 'ctg.2': '00193', + 'ctg.3': '00193', + 'german': 'statlog/german', + 'haberman': 'haberman', + 'ionosphere': 'ionosphere', + 'iris.1': 'iris', + 'iris.2': 'iris', + 'iris.3': 'iris', + 'mammographic': 'mammographic-masses', + 'pageblocks.5': 'page-blocks', + 'semeion': 'semeion', + 'sonar': 'undocumented/connectionist-bench/sonar', + 'spambase': 'spambase', + 'spectf': 'spect', + 'tictactoe': 'tic-tac-toe', + 'transfusion': 'blood-transfusion', + 'wdbc': 'breast-cancer-wisconsin', + 'wine-q-red': 'wine-quality', + 'wine-q-white': 'wine-quality', + 'wine.1': 'wine', + 'wine.2': 'wine', + 'wine.3': 'wine', + 'yeast': 'yeast', + } + + # the filename is the name of the file within the data_folder indexed by the identifier + file_name = { + 'acute': 'diagnosis.data', + '00193': 'CTG.xls', + 'statlog/german': 'german.data-numeric', + 'mammographic-masses': 'mammographic_masses.data', + 'page-blocks': 'page-blocks.data.Z', + 'undocumented/connectionist-bench/sonar': 'sonar.all-data', + 'spect': ['SPECTF.train', 'SPECTF.test'], + 'blood-transfusion': 'transfusion.data', + 'wine-quality': ['winequality-red.csv', 'winequality-white.csv'], + 'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data' + } + + # the filename containing the dataset description (if any) + desc_name = { + 'acute': 'diagnosis.names', + '00193': None, + 'statlog/german': 'german.doc', + 'mammographic-masses': 'mammographic_masses.names', + 'undocumented/connectionist-bench/sonar': 'sonar.names', + 'spect': 'SPECTF.names', + 'blood-transfusion': 'transfusion.names', + 'wine-quality': 'winequality.names', + 'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names' + } + + identifier = identifier_map[dataset_name] + filename = file_name.get(identifier, f'{identifier}.data') + descfile = desc_name.get(identifier, f'{identifier}.names') + fullname = dataset_fullname[dataset_name] + + URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}' + data_dir = join(data_home, 'uci_datasets', identifier) + if isinstance(filename, str): # filename could be a list of files, in which case it will be processed later + data_path = join(data_dir, filename) + download_file_if_not_exists(f'{URL}/{filename}', data_path) + + if descfile: + try: + download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}') + if verbose: + print(open(f'{data_dir}/{descfile}', 'rt').read()) + except Exception: + print('could not read the description file') + elif verbose: + print('no file description available') + + if verbose: + print(f'Loading 
{dataset_name} ({fullname})') + if identifier == 'acute': + df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t') + + df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False) + [_df_replace(df, col) for col in range(1, 6)] + X = df.loc[:, 0:5].values + if dataset_name == 'acute.a': + y = binarize(df[6], pos_class='yes') + elif dataset_name == 'acute.b': + y = binarize(df[7], pos_class='yes') + + if identifier == 'balance-scale': + df = pd.read_csv(data_path, header=None, sep=',') + if dataset_name == 'balance.1': + y = binarize(df[0], pos_class='L') + elif dataset_name == 'balance.2': + y = binarize(df[0], pos_class='B') + elif dataset_name == 'balance.3': + y = binarize(df[0], pos_class='R') + X = df.loc[:, 1:].astype(float).values + + if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer': + df = pd.read_csv(data_path, header=None, sep=',') + Xy = df.loc[:, 1:10] + Xy[Xy=='?']=np.nan + Xy = Xy.dropna(axis=0) + X = Xy.loc[:, 1:9] + X = X.astype(float).values + y = binarize(Xy[10], pos_class=2) + + if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc': + df = pd.read_csv(data_path, header=None, sep=',') + X = df.loc[:, 2:32].astype(float).values + y = df[1].values + y = binarize(y, pos_class='M') + + if identifier == 'cmc': + df = pd.read_csv(data_path, header=None, sep=',') + X = df.loc[:, 0:8].astype(float).values + y = df[9].astype(int).values + if dataset_name == 'cmc.1': + y = binarize(y, pos_class=1) + elif dataset_name == 'cmc.2': + y = binarize(y, pos_class=2) + elif dataset_name == 'cmc.3': + y = binarize(y, pos_class=3) + + if identifier == '00193': + df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3) + df = df[list(range(1,24))] # select columns numbered (number 23 is the target label) + # replaces the header with the first row + new_header = df.iloc[0] # grab the first row for the header + df = df[1:] # take the data less the header row + df.columns = new_header # set the header row as the df header + X = df.iloc[:, 0:22].astype(float).values + y = df['NSP'].astype(int).values + if dataset_name == 'ctg.1': + y = binarize(y, pos_class=1) # 1==Normal + elif dataset_name == 'ctg.2': + y = binarize(y, pos_class=2) # 2==Suspect + elif dataset_name == 'ctg.3': + y = binarize(y, pos_class=3) # 3==Pathologic + + if identifier == 'statlog/german': + df = pd.read_csv(data_path, header=None, delim_whitespace=True) + X = df.iloc[:, 0:24].astype(float).values + y = df[24].astype(int).values + y = binarize(y, pos_class=1) + + if identifier == 'haberman': + df = pd.read_csv(data_path, header=None) + X = df.iloc[:, 0:3].astype(float).values + y = df[3].astype(int).values + y = binarize(y, pos_class=2) + + if identifier == 'ionosphere': + df = pd.read_csv(data_path, header=None) + X = df.iloc[:, 0:34].astype(float).values + y = df[34].values + y = binarize(y, pos_class='b') + + if identifier == 'iris': + df = pd.read_csv(data_path, header=None) + X = df.iloc[:, 0:4].astype(float).values + y = df[4].values + if dataset_name == 'iris.1': + y = binarize(y, pos_class='Iris-setosa') # 1==Setosa + elif dataset_name == 'iris.2': + y = binarize(y, pos_class='Iris-versicolor') # 2==Versicolor + elif dataset_name == 'iris.3': + y = binarize(y, pos_class='Iris-virginica') # 3==Virginica + + if identifier == 'mammographic-masses': + df = pd.read_csv(data_path, header=None, sep=',') + df[df == '?'] = np.nan + Xy = df.dropna(axis=0) + X = Xy.iloc[:, 0:5] + X = X.astype(float).values + y = binarize(Xy.iloc[:,5], 
pos_class=1) + + if identifier == 'page-blocks': + data_path_ = data_path.replace('.Z', '') + if not os.path.exists(data_path_): + raise FileNotFoundError(f'Warning: file {data_path_} does not exist. If this is the first time you ' + f'attempt to load this dataset, then you have to manually unzip the {data_path} ' + f'and name the extracted file {data_path_} (unfortunately, neither zipfile, nor ' + f'gzip can handle unix compressed files automatically -- there is a repo in GitHub ' + f'https://github.com/umeat/unlzw where the problem seems to be solved anyway).') + df = pd.read_csv(data_path_, header=None, delim_whitespace=True) + X = df.iloc[:, 0:10].astype(float).values + y = df[10].values + y = binarize(y, pos_class=5) # 5==block "graphic" + + if identifier == 'semeion': + df = pd.read_csv(data_path, header=None, delim_whitespace=True ) + X = df.iloc[:, 0:256].astype(float).values + y = df[263].values # 263 stands for digit 8 (labels are one-hot vectors from col 256-266) + y = binarize(y, pos_class=1) + + if identifier == 'undocumented/connectionist-bench/sonar': + df = pd.read_csv(data_path, header=None, sep=',') + X = df.iloc[:, 0:60].astype(float).values + y = df[60].values + y = binarize(y, pos_class='R') + + if identifier == 'spambase': + df = pd.read_csv(data_path, header=None, sep=',') + X = df.iloc[:, 0:57].astype(float).values + y = df[57].values + y = binarize(y, pos_class=1) + + if identifier == 'spect': + dfs = [] + for file in filename: + data_path = join(data_dir, file) + download_file_if_not_exists(f'{URL}/{file}', data_path) + dfs.append(pd.read_csv(data_path, header=None, sep=',')) + df = pd.concat(dfs) + X = df.iloc[:, 1:45].astype(float).values + y = df[0].values + y = binarize(y, pos_class=0) + + if identifier == 'tic-tac-toe': + df = pd.read_csv(data_path, header=None, sep=',') + X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values + y = df[9].values + y = binarize(y, pos_class='negative') + + if identifier == 'blood-transfusion': + df = pd.read_csv(data_path, sep=',') + X = df.iloc[:, 0:4].astype(float).values + y = df.iloc[:, 4].values + y = binarize(y, pos_class=1) + + if identifier == 'wine': + df = pd.read_csv(data_path, header=None, sep=',') + X = df.iloc[:, 1:14].astype(float).values + y = df[0].values + if dataset_name == 'wine.1': + y = binarize(y, pos_class=1) + elif dataset_name == 'wine.2': + y = binarize(y, pos_class=2) + elif dataset_name == 'wine.3': + y = binarize(y, pos_class=3) + + if identifier == 'wine-quality': + filename = filename[0] if dataset_name=='wine-q-red' else filename[1] + data_path = join(data_dir, filename) + download_file_if_not_exists(f'{URL}/{filename}', data_path) + df = pd.read_csv(data_path, sep=';') + X = df.iloc[:, 0:11].astype(float).values + y = df.iloc[:, 11].values > 5 + + if identifier == 'yeast': + df = pd.read_csv(data_path, header=None, delim_whitespace=True) + X = df.iloc[:, 1:9].astype(float).values + y = df.iloc[:, 9].values + y = binarize(y, pos_class='NUC') + + data = LabelledCollection(X, y) + if verbose: + data.stats() + return data
+ + +
[docs]def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset: + """ + Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. + + The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria: + - It has more than 1000 instances + - It is suited for classification + - It has more than two classes + - It is available for Python import (requires ucimlrepo package) + + >>> import quapy as qp + >>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean") + >>> train, test = dataset.train_test + >>> ... + + The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS` + + The datasets are downloaded only once and pickled into disk, saving time for consecutive calls. + + :param dataset_name: a dataset name + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quay_data/ directory) + :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param verbose: set to True (default is False) to get information (stats) about the dataset + :return: a :class:`quapy.data.base.Dataset` instance + """ + data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose) + return Dataset(*data.split_stratified(1 - test_split, random_state=0))
+ + +
[docs]def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: + """ + Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`. + + The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria: + - It has more than 1000 instances + - It is suited for classification + - It has more than two classes + - It is available for Python import (requires ucimlrepo package) + + >>> import quapy as qp + >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean") + >>> X, y = collection.Xy + >>> ... + + The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS` + + The datasets are downloaded only once and pickled into disk, saving time for consecutive calls. + + :param dataset_name: a dataset name + :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default + ~/quay_data/ directory) + :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param verbose: set to True (default is False) to get information (stats) about the dataset + :return: a :class:`quapy.data.base.LabelledCollection` instance + """ + assert dataset_name in UCI_MULTICLASS_DATASETS, \ + f'Name {dataset_name} does not match any known dataset from the ' \ + f'UCI Machine Learning datasets repository (multiclass). ' \ + f'Valid ones are {UCI_MULTICLASS_DATASETS}' + + if data_home is None: + data_home = get_quapy_home() + + identifiers = { + "dry-bean": 602, + "wine-quality": 186, + "academic-success": 697, + "digits": 80, + "letter": 59 + } + + full_names = { + "dry-bean": "Dry Bean Dataset", + "wine-quality": "Wine Quality", + "academic-success": "Predict students' dropout and academic success", + "digits": "Optical Recognition of Handwritten Digits", + "letter": "Letter Recognition" + } + + identifier = identifiers[dataset_name] + fullname = full_names[dataset_name] + + if verbose: + print(f'Loading UCI Muticlass {dataset_name} ({fullname})') + + file = join(data_home, 'uci_multiclass', dataset_name+'.pkl') + + def download(id): + data = fetch_ucirepo(id=id) + X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze() + classes = np.sort(np.unique(y)) + y = np.searchsorted(classes, y) + return LabelledCollection(X, y) + + data = pickled_resource(file, download, identifier) + + if verbose: + data.stats() + + return data
+ + +def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float): + df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False) + + +
[docs]def fetch_lequa2022(task, data_home=None): + """ + Loads the official datasets provided for the `LeQua <https://lequa2022.github.io/index>`_ competition. + In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification + problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide raw documents instead. + Tasks T1A and T2A are binary sentiment quantification problems, while T2A and T2B are multiclass quantification + problems consisting of estimating the class prevalence values of 28 different merchandise products. + We refer to the `Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022). + A Detailed Overview of LeQua@ CLEF 2022: Learning to Quantify. + <https://ceur-ws.org/Vol-3180/paper-146.pdf>`_ for a detailed description + on the tasks and datasets. + + The datasets are downloaded only once, and stored for fast reuse. + + See `lequa2022_experiments.py` provided in the example folder, that can serve as a guide on how to use these + datasets. + + + :param task: a string representing the task name; valid ones are T1A, T1B, T2A, and T2B + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quay_data/ directory) + :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of + :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of + :class:`quapy.data._lequa2022.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`, + that return a series of samples stored in a directory which are labelled by prevalence. + """ + + from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir + + assert task in LEQUA2022_TASKS, \ + f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}' + if data_home is None: + data_home = get_quapy_home() + + URL_TRAINDEV=f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip' + URL_TEST=f'https://zenodo.org/record/6546188/files/{task}.test.zip' + URL_TEST_PREV=f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip' + + lequa_dir = join(data_home, 'lequa2022') + os.makedirs(lequa_dir, exist_ok=True) + + def download_unzip_and_remove(unzipped_path, url): + tmp_path = join(lequa_dir, task + '_tmp.zip') + download_file_if_not_exists(url, tmp_path) + with zipfile.ZipFile(tmp_path) as file: + file.extractall(unzipped_path) + os.remove(tmp_path) + + if not os.path.exists(join(lequa_dir, task)): + download_unzip_and_remove(lequa_dir, URL_TRAINDEV) + download_unzip_and_remove(lequa_dir, URL_TEST) + download_unzip_and_remove(lequa_dir, URL_TEST_PREV) + + if task in ['T1A', 'T1B']: + load_fn = load_vector_documents + elif task in ['T2A', 'T2B']: + load_fn = load_raw_documents + + tr_path = join(lequa_dir, task, 'public', 'training_data.txt') + train = LabelledCollection.load(tr_path, loader_func=load_fn) + + val_samples_path = join(lequa_dir, task, 'public', 'dev_samples') + val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt') + val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn) + + test_samples_path = join(lequa_dir, task, 'public', 'test_samples') + test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt') + test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn) + + return train, val_gen, test_gen
+ + +
[docs]def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None): + """ + Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more + information on this dataset, please follow the zenodo link). + This dataset is based on the data available publicly at + `WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_. + The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_. + Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms. + + The datasets are downloaded only once, and stored for fast reuse. + + :param single_sample_train: a boolean. If True, the training data is returned as a single + :class:`quapy.data.base.LabelledCollection` (all examples together). + If False, a generator of training samples is returned instead. Each example in the training set has an individual label. + :param for_model_selection: if True, returns a split of the training set in which 30% of the samples (86 out of 286) are held out for model selection; + if False, returns the full training set as the training set and the test set as the test set + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quapy_data/ directory) + :return: a tuple `(train, test_gen)` where `train` is an instance of + :class:`quapy.data.base.LabelledCollection` if `single_sample_train` is True, or an instance of + :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir` (i.e., a sampling protocol that returns a series of samples + labelled example by example) otherwise. `test_gen` will be a :class:`quapy.data._ifcb.IFCBTestSamples`, + i.e., a sampling protocol that returns a series of samples labelled by prevalence. 
+ """ + + from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples, get_sample_list, generate_modelselection_split + + if data_home is None: + data_home = get_quapy_home() + + URL_TRAIN=f'https://zenodo.org/records/10036244/files/IFCB.train.zip' + URL_TEST=f'https://zenodo.org/records/10036244/files/IFCB.test.zip' + URL_TEST_PREV=f'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip' + + ifcb_dir = join(data_home, 'ifcb') + os.makedirs(ifcb_dir, exist_ok=True) + + def download_unzip_and_remove(unzipped_path, url): + tmp_path = join(ifcb_dir, 'ifcb_tmp.zip') + download_file_if_not_exists(url, tmp_path) + with zipfile.ZipFile(tmp_path) as file: + file.extractall(unzipped_path) + os.remove(tmp_path) + + if not os.path.exists(os.path.join(ifcb_dir,'train')): + download_unzip_and_remove(ifcb_dir, URL_TRAIN) + if not os.path.exists(os.path.join(ifcb_dir,'test')): + download_unzip_and_remove(ifcb_dir, URL_TEST) + if not os.path.exists(os.path.join(ifcb_dir,'test_prevalences.csv')): + download_unzip_and_remove(ifcb_dir, URL_TEST_PREV) + + # Load test prevalences and classes + test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv') + test_true_prev = pd.read_csv(test_true_prev_path) + classes = test_true_prev.columns[1:] + + #Load train and test samples + train_samples_path = join(ifcb_dir,'train') + test_samples_path = join(ifcb_dir,'test') + + if for_model_selection: + # In this case, return 70% of training data as the training set and 30% as the test set + samples = get_sample_list(train_samples_path) + train, test = generate_modelselection_split(samples, split=0.3) + train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train) + + # Test prevalence is computed from class labels + test_gen = IFCBTestSamples(path_dir=train_samples_path, test_prevalences=None, samples=test, classes=classes) + else: + # In this case, we use all training samples as the training set and the test samples as the test set + train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes) + test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences=test_true_prev) + + # In the case the user wants it, join all the train samples in one LabelledCollection + if single_sample_train: + train = LabelledCollection.join(*[lc for lc in train_gen()]) + return train, test_gen + else: + return train_gen, test_gen
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/data/preprocessing.html b/docs/build/html/_modules/quapy/data/preprocessing.html new file mode 100644 index 0000000..a50aa64 --- /dev/null +++ b/docs/build/html/_modules/quapy/data/preprocessing.html @@ -0,0 +1,373 @@ + + + + + + quapy.data.preprocessing — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + +
Source code for quapy.data.preprocessing
+import numpy as np
+from scipy.sparse import spmatrix
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from sklearn.preprocessing import StandardScaler
+from tqdm import tqdm
+
+import quapy as qp
+from quapy.data.base import Dataset
+from quapy.util import map_parallel
+from .base import LabelledCollection
+
+
+
+[docs] +def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs): + """ + Transforms a :class:`quapy.data.base.Dataset` of textual instances into a :class:`quapy.data.base.Dataset` of + tfidf weighted sparse vectors + + :param dataset: a :class:`quapy.data.base.Dataset` where the instances of training and test collections are + lists of str + :param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary (default 3) + :param sublinear_tf: whether or not to apply log scaling to the tf counters (default True) + :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default) + :param kwargs: the rest of parameters of the transformation (as for sklearn's + `TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_) + :return: a new :class:`quapy.data.base.Dataset` in `csr_matrix` format (if inplace=False) or a reference to the + current Dataset (if inplace=True) where the instances are stored in a `csr_matrix` of real-valued tfidf scores + """ + __check_type(dataset.training.instances, np.ndarray, str) + __check_type(dataset.test.instances, np.ndarray, str) + + vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs) + training_documents = vectorizer.fit_transform(dataset.training.instances) + test_documents = vectorizer.transform(dataset.test.instances) + + if inplace: + dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.classes_) + dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_) + dataset.vocabulary = vectorizer.vocabulary_ + return dataset + else: + training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_) + test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_) + return Dataset(training, test, vectorizer.vocabulary_)
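A minimal sketch of the intended usage; the dataset name is an illustrative choice among the textual datasets shipped with QuaPy:

```python
import quapy as qp
from quapy.data.preprocessing import text2tfidf

# turn a textual dataset into tfidf-weighted sparse vectors
dataset = qp.datasets.fetch_reviews('kindle')
dataset_tfidf = text2tfidf(dataset, min_df=5)
print(dataset_tfidf.training.instances.shape)  # (n_training_docs, vocabulary_size)
```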
+ + + +
+[docs] +def reduce_columns(dataset: Dataset, min_df=5, inplace=False): + """ + Reduces the dimensionality of the instances, represented as a `csr_matrix` (or any subtype of + `scipy.sparse.spmatrix`), of training and test documents by removing the columns of words which are not present + in at least `min_df` instances in the training set + + :param dataset: a :class:`quapy.data.base.Dataset` in which instances are represented in sparse format (any + subtype of scipy.sparse.spmatrix) + :param min_df: integer, minimum number of instances below which the columns are removed + :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default) + :return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current + :class:`quapy.data.base.Dataset` (inplace=True) where the dimensions corresponding to infrequent terms + in the training set have been removed + """ + __check_type(dataset.training.instances, spmatrix) + __check_type(dataset.test.instances, spmatrix) + assert dataset.training.instances.shape[1] == dataset.test.instances.shape[1], 'unaligned vector spaces' + + def filter_by_occurrences(X, W): + column_prevalence = np.asarray((X > 0).sum(axis=0)).flatten() + take_columns = column_prevalence >= min_df + X = X[:, take_columns] + W = W[:, take_columns] + return X, W + + Xtr, Xte = filter_by_occurrences(dataset.training.instances, dataset.test.instances) + if inplace: + dataset.training.instances = Xtr + dataset.test.instances = Xte + return dataset + else: + training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.classes_) + test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.classes_) + return Dataset(training, test)
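A short sketch, assuming `dataset_tfidf` holds sparse instances (e.g., the output of the text2tfidf sketch above):

```python
from quapy.data.preprocessing import reduce_columns

# drop the columns of terms appearing in fewer than 10 training documents
reduced = reduce_columns(dataset_tfidf, min_df=10)
print(reduced.training.instances.shape)  # same number of rows, fewer columns
```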
+ + + +
+[docs] +def standardize(dataset: Dataset, inplace=False): + """ + Standardizes the real-valued columns of a :class:`quapy.data.base.Dataset`. + Standardization, aka z-scoring, of a variable `X` comes down to subtracting the average and normalizing by the + standard deviation. + + :param dataset: a :class:`quapy.data.base.Dataset` object + :param inplace: set to True if the transformation is to be applied inplace, or to False (default) if a new + :class:`quapy.data.base.Dataset` is to be returned + :return: an instance of :class:`quapy.data.base.Dataset` + """ + s = StandardScaler(copy=not inplace) + training = s.fit_transform(dataset.training.instances) + test = s.transform(dataset.test.instances) + if inplace: + return dataset + else: + return Dataset(training, test, dataset.vocabulary, dataset.name)
+ + + +
+[docs] +def index(dataset: Dataset, min_df=5, inplace=False, **kwargs): + """ + Indexes the tokens of a textual :class:`quapy.data.base.Dataset` of string documents. + To index a document means to replace each different token by a unique numerical index. + Rare words (i.e., words occurring less than `min_df` times) are replaced by a special token `UNK` + + :param dataset: a :class:`quapy.data.base.Dataset` object where the instances of training and test documents + are lists of str + :param min_df: minimum number of occurrences below which the term is replaced by a `UNK` index + :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default) + :param kwargs: the rest of parameters of the transformation (as for sklearn's + `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>_`) + :return: a new :class:`quapy.data.base.Dataset` (if inplace=False) or a reference to the current + :class:`quapy.data.base.Dataset` (inplace=True) consisting of lists of integer values representing indices. + """ + __check_type(dataset.training.instances, np.ndarray, str) + __check_type(dataset.test.instances, np.ndarray, str) + + indexer = IndexTransformer(min_df=min_df, **kwargs) + training_index = indexer.fit_transform(dataset.training.instances) + test_index = indexer.transform(dataset.test.instances) + + training_index = np.asarray(training_index, dtype=object) + test_index = np.asarray(test_index, dtype=object) + + if inplace: + dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.classes_) + dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.classes_) + dataset.vocabulary = indexer.vocabulary_ + return dataset + else: + training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.classes_) + test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.classes_) + return Dataset(training, test, indexer.vocabulary_)
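A minimal sketch of indexing a textual dataset; again, the dataset name is an illustrative assumption:

```python
import quapy as qp
from quapy.data.preprocessing import index

# replace tokens with numerical ids (rare tokens map to the UNK index)
dataset = qp.datasets.fetch_reviews('kindle')
indexed = index(dataset, min_df=5)
print(indexed.training.instances[0][:10])  # first 10 token ids of the first training document
```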
+ + + +def __check_type(container, container_type=None, element_type=None): + if container_type: + assert isinstance(container, container_type), \ + f'unexpected type of container (expected {container_type}, found {type(container)})' + if element_type: + assert isinstance(container[0], element_type), \ + f'unexpected type of element (expected {element_type}, found {type(container[0])})' + + +
+[docs] +class IndexTransformer: + """ + This class implements a sklearn-style transformer that indexes text as the numerical ids of the tokens it + contains, following the vocabulary that would be generated by sklearn's + `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_ + + :param kwargs: keyworded arguments from + `CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_ + """ + + def __init__(self, **kwargs): + self.vect = CountVectorizer(**kwargs) + self.unk = -1 # a valid index is assigned after fit + self.pad = -2 # a valid index is assigned after fit +
+[docs] + def fit(self, X): + """ + Fits the transformer, i.e., decides on the vocabulary, given a list of strings. + + :param X: a list of strings + :return: self + """ + self.vect.fit(X) + self.analyzer = self.vect.build_analyzer() + self.vocabulary_ = self.vect.vocabulary_ + self.unk = self.add_word(qp.environ['UNK_TOKEN'], qp.environ['UNK_INDEX']) + self.pad = self.add_word(qp.environ['PAD_TOKEN'], qp.environ['PAD_INDEX']) + return self
+ + +
+[docs] + def transform(self, X, n_jobs=None): + """ + Transforms the strings in `X` as lists of numerical ids + + :param X: a list of strings + :param n_jobs: the number of parallel workers to carry out this task + :return: a `np.ndarray` of numerical ids + """ + # given the number of tasks and the number of jobs, generates the slices for the parallel processes + assert self.unk != -1, 'transform called before fit' + n_jobs = qp._get_njobs(n_jobs) + return map_parallel(func=self._index, args=X, n_jobs=n_jobs)
+ + + + def _index(self, documents): + vocab = self.vocabulary_.copy() + return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')] + +
+[docs] + def fit_transform(self, X, n_jobs=None): + """ + Fits the transform on `X` and transforms it. + + :param X: a list of strings + :param n_jobs: the number of parallel workers to carry out this task + :return: a `np.ndarray` of numerical ids + """ + return self.fit(X).transform(X, n_jobs=n_jobs)
+ + +
+[docs] + def vocabulary_size(self): + """ + Gets the length of the vocabulary according to which the document tokens have been indexed + + :return: integer + """ + return len(self.vocabulary_)
+ + +
+[docs] + def add_word(self, word, id=None, nogaps=True): + """ + Adds a new token (regardless of whether it has been found in the text or not), with dedicated id. + Useful to define special tokens for codifying unknown words, or padding tokens. + + :param word: string, surface form of the token + :param id: integer, numerical value to assign to the token (leave as None for indicating the next valid id, + default) + :param nogaps: if set to True (default) asserts that the id indicated leads to no numerical gaps with + precedent ids stored so far + :return: integer, the numerical id for the new token + """ + if word in self.vocabulary_: + raise ValueError(f'word {word} already in dictionary') + if id is None: + # add the word with the next id + self.vocabulary_[word] = len(self.vocabulary_) + else: + id2word = {id_:word_ for word_, id_ in self.vocabulary_.items()} + if id in id2word: + old_word = id2word[id] + self.vocabulary_[word] = id + del self.vocabulary_[old_word] + self.add_word(old_word) + elif nogaps: + if id > self.vocabulary_size()+1: + raise ValueError(f'word {word} added with id {id}, while the current vocabulary size ' + f'is of {self.vocabulary_size()}, and id gaps are not allowed') + return self.vocabulary_[word]
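A toy sketch of the transformer's fit/transform cycle; it assumes the default `UNK_TOKEN`/`PAD_TOKEN` entries of `qp.environ` are in place, as the `fit` method shown above relies on them:

```python
from quapy.data.preprocessing import IndexTransformer

# index a tiny corpus; special UNK and PAD tokens are added during fit
corpus = ['a first document', 'a second document', 'something else entirely']
indexer = IndexTransformer(min_df=1)
ids = indexer.fit_transform(corpus)
print(ids)                        # one list of token ids per document
print(indexer.vocabulary_size())  # vocabulary size, including UNK and PAD
```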
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/data/reader.html b/docs/build/html/_modules/quapy/data/reader.html new file mode 100644 index 0000000..4c9c163 --- /dev/null +++ b/docs/build/html/_modules/quapy/data/reader.html @@ -0,0 +1,244 @@ + + + + + + quapy.data.reader — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + +
Source code for quapy.data.reader
+import numpy as np
+from scipy.sparse import dok_matrix
+from tqdm import tqdm
+
+
+
+[docs] +def from_text(path, encoding='utf-8', verbose=1, class2int=True): + """ + Reads a labelled collection of documents. + File format <0 or 1>\t<document>\n + + :param path: path to the labelled collection + :param encoding: the text encoding used to open the file + :param verbose: if >0 (default) shows some progress information in standard output + :param class2int: if True (default), the labels are converted to integers + :return: a list of sentences, and a list of labels + """ + all_sentences, all_labels = [], [] + if verbose>0: + file = tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}') + else: + file = open(path, 'rt', encoding=encoding).readlines() + for line in file: + line = line.strip() + if line: + try: + label, sentence = line.split('\t') + sentence = sentence.strip() + if class2int: + label = int(label) + if sentence: + all_sentences.append(sentence) + all_labels.append(label) + except ValueError: + print(f'format error in {line}') + return all_sentences, all_labels
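A minimal sketch, assuming a hypothetical file `reviews.txt` with one "&lt;label&gt;\t&lt;document&gt;" pair per line:

```python
from quapy.data.reader import from_text

# read the labelled documents; returns parallel lists of texts and labels
texts, labels = from_text('reviews.txt')
print(len(texts), len(labels))
```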
+ + + +
+[docs] +def from_sparse(path): + """ + Reads a labelled collection of real-valued instances expressed in sparse format + File format <-1 or 0 or 1>[\s col(int):val(float)]\n + + :param path: path to the labelled collection + :return: a `csr_matrix` containing the instances (rows), and a ndarray containing the labels + """ + + def split_col_val(col_val): + col, val = col_val.split(':') + col, val = int(col) - 1, float(val) + return col, val + + all_documents, all_labels = [], [] + max_col = 0 + for line in tqdm(open(path, 'rt').readlines(), f'loading {path}'): + parts = line.strip().split() + if parts: + all_labels.append(int(parts[0])) + cols, vals = zip(*[split_col_val(col_val) for col_val in parts[1:]]) + cols, vals = np.asarray(cols), np.asarray(vals) + max_col = max(max_col, cols.max()) + all_documents.append((cols, vals)) + n_docs = len(all_labels) + X = dok_matrix((n_docs, max_col + 1), dtype=float) + for i, (cols, vals) in tqdm(enumerate(all_documents), total=len(all_documents), + desc=f'\-- filling matrix of shape {X.shape}'): + X[i, cols] = vals + X = X.tocsr() + y = np.asarray(all_labels) + 1 + return X, y
+ + + +
+[docs] +def from_csv(path, encoding='utf-8'): + """ + Reads a csv file in which columns are separated by ','. + File format <label>,<feat1>,<feat2>,...,<featn>\n + + :param path: path to the csv file + :param encoding: the text encoding used to open the file + :return: a np.ndarray for the labels and a ndarray (float) for the covariates + """ + + X, y = [], [] + for instance in tqdm(open(path, 'rt', encoding=encoding).readlines(), desc=f'reading {path}'): + yi, *xi = instance.strip().split(',') + X.append(list(map(float,xi))) + y.append(yi) + X = np.asarray(X) + y = np.asarray(y) + return X, y
+ + + +
+[docs] +def reindex_labels(y): + """ + Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes. + E.g.: + + >>> reindex_labels(['B', 'B', 'A', 'C']) + >>> (array([1, 1, 0, 2]), array(['A', 'B', 'C'], dtype='<U1')) + + :param y: the list or array of original labels + :return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes. + """ + y = np.asarray(y) + classnames = np.asarray(sorted(np.unique(y))) + label2index = {label: index for index, label in enumerate(classnames)} + indexed = np.empty(y.shape, dtype=int) + for label in classnames: + indexed[y==label] = label2index[label] + return indexed, classnames
+ + + +
+[docs] +def binarize(y, pos_class): + """ + Binarizes a categorical array-like collection of labels towards the positive class `pos_class`. E.g.,: + + >>> binarize([1, 2, 3, 1, 1, 0], pos_class=2) + >>> array([0, 1, 0, 0, 0, 0]) + + :param y: array-like of labels + :param pos_class: integer, the positive class + :return: a binary np.ndarray, in which a value of 1 corresponds to positions in which `y` had `pos_class` labels, and + 0 otherwise + """ + y = np.asarray(y) + ybin = np.zeros(y.shape, dtype=int) + ybin[y == pos_class] = 1 + return ybin
+ + +
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/error.html b/docs/build/html/_modules/quapy/error.html new file mode 100644 index 0000000..1613468 --- /dev/null +++ b/docs/build/html/_modules/quapy/error.html @@ -0,0 +1,433 @@ + + + + + + quapy.error — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
Source code for quapy.error
+"""Implementation of error measures used for quantification"""
+
+import numpy as np
+from sklearn.metrics import f1_score
+import quapy as qp
+
+
+
[docs]def from_name(err_name): + """Gets an error function from its name. E.g., `from_name("mae")` + will return function :meth:`quapy.error.mae` + + :param err_name: string, the error name + :return: a callable implementing the requested error + """ + assert err_name in ERROR_NAMES, f'unknown error {err_name}' + callable_error = globals()[err_name] + return callable_error
+ + +
[docs]def f1e(y_true, y_pred): + """F1 error: simply computes the error in terms of macro :math:`F_1`, i.e., + :math:`1-F_1^M`, where :math:`F_1` is the harmonic mean of precision and recall, + defined as :math:`\\frac{2tp}{2tp+fp+fn}`, with `tp`, `fp`, and `fn` standing + for true positives, false positives, and false negatives, respectively. + `Macro` averaging means the :math:`F_1` is computed for each category independently, + and then averaged. + + :param y_true: array-like of true labels + :param y_pred: array-like of predicted labels + :return: :math:`1-F_1^M` + """ + return 1. - f1_score(y_true, y_pred, average='macro')
+ + +
[docs]def acce(y_true, y_pred): + """Computes the error in terms of 1-accuracy. The accuracy is computed as + :math:`\\frac{tp+tn}{tp+fp+fn+tn}`, with `tp`, `fp`, `fn`, and `tn` standing + for true positives, false positives, false negatives, and true negatives, + respectively + + :param y_true: array-like of true labels + :param y_pred: array-like of predicted labels + :return: 1-accuracy + """ + return 1. - (y_true == y_pred).mean()
+ + +
[docs]def mae(prevs, prevs_hat): + """Computes the mean absolute error (see :meth:`quapy.error.ae`) across the sample pairs. + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :return: mean absolute error + """ + return ae(prevs, prevs_hat).mean()
+ + +
[docs]def ae(prevs, prevs_hat): + """Computes the absolute error between the two prevalence vectors. + Absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as + :math:`AE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}|\\hat{p}(y)-p(y)|`, + where :math:`\\mathcal{Y}` are the classes of interest. + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :return: absolute error + """ + assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}' + return abs(prevs_hat - prevs).mean(axis=-1)
+ + +
[docs]def nae(prevs, prevs_hat): + """Computes the normalized absolute error between the two prevalence vectors. + Normalized absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as + :math:`NAE(p,\\hat{p})=\\frac{AE(p,\\hat{p})}{z_{AE}}`, + where :math:`z_{AE}=\\frac{2(1-\\min_{y\\in \\mathcal{Y}} p(y))}{|\\mathcal{Y}|}`, and :math:`\\mathcal{Y}` + are the classes of interest. + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :return: normalized absolute error + """ + assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}' + return abs(prevs_hat - prevs).sum(axis=-1)/(2*(1-prevs.min(axis=-1)))
+ + +
[docs]def mnae(prevs, prevs_hat): + """Computes the mean normalized absolute error (see :meth:`quapy.error.nae`) across the sample pairs. + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :return: mean normalized absolute error + """ + return nae(prevs, prevs_hat).mean()
+ + +
[docs]def mse(prevs, prevs_hat): + """Computes the mean squared error (see :meth:`quapy.error.se`) across the sample pairs. + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the + true prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the + predicted prevalence values + :return: mean squared error + """ + return se(prevs, prevs_hat).mean()
+ + +
[docs]def se(prevs, prevs_hat): + """Computes the squared error between the two prevalence vectors. + Squared error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as + :math:`SE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}(\\hat{p}(y)-p(y))^2`, + where + :math:`\\mathcal{Y}` are the classes of interest. + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :return: squared error + """ + return ((prevs_hat - prevs) ** 2).mean(axis=-1)
+ + +
[docs]def mkld(prevs, prevs_hat, eps=None): + """Computes the mean Kullback-Leibler divergence (see :meth:`quapy.error.kld`) across the + sample pairs. The distributions are smoothed using the `eps` factor + (see :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :param eps: smoothing factor. KLD is not defined in cases in which the distributions contain + zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. + If `eps=None`, the sample size will be taken from the environment variable `SAMPLE_SIZE` + (which has thus to be set beforehand). + :return: mean Kullback-Leibler divergence + """ + return kld(prevs, prevs_hat, eps).mean()
+ + +
[docs]def kld(prevs, prevs_hat, eps=None): + """Computes the Kullback-Leibler divergence between the two prevalence distributions. + Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}` + is computed as + :math:`KLD(p,\\hat{p})=D_{KL}(p||\\hat{p})= + \\sum_{y\\in \\mathcal{Y}} p(y)\\log\\frac{p(y)}{\\hat{p}(y)}`, + where :math:`\\mathcal{Y}` are the classes of interest. + The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :param eps: smoothing factor. KLD is not defined in cases in which the distributions contain + zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. + If `eps=None`, the sample size will be taken from the environment variable `SAMPLE_SIZE` + (which has thus to be set beforehand). + :return: Kullback-Leibler divergence between the two distributions + """ + eps = __check_eps(eps) + smooth_prevs = prevs + eps + smooth_prevs_hat = prevs_hat + eps + return (smooth_prevs*np.log(smooth_prevs/smooth_prevs_hat)).sum(axis=-1)
+ + +
[docs]def mnkld(prevs, prevs_hat, eps=None): + """Computes the mean Normalized Kullback-Leibler divergence (see :meth:`quapy.error.nkld`) + across the sample pairs. The distributions are smoothed using the `eps` factor + (see :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain + zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. + If `eps=None`, the sample size will be taken from the environment variable `SAMPLE_SIZE` + (which has thus to be set beforehand). + :return: mean Normalized Kullback-Leibler divergence + """ + return nkld(prevs, prevs_hat, eps).mean()
+ + +
[docs]def nkld(prevs, prevs_hat, eps=None): + """Computes the Normalized Kullback-Leibler divergence between the two prevalence distributions. + Normalized Kullback-Leibler divergence between two prevalence distributions :math:`p` and + :math:`\\hat{p}` is computed as + :math:`NKLD(p,\\hat{p}) = 2\\frac{e^{KLD(p,\\hat{p})}}{e^{KLD(p,\\hat{p})}+1}-1`, + where + :math:`\\mathcal{Y}` are the classes of interest. + The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :param eps: smoothing factor. NKLD is not defined in cases in which the distributions + contain zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample + size. If `eps=None`, the sample size will be taken from the environment variable + `SAMPLE_SIZE` (which has thus to be set beforehand). + :return: Normalized Kullback-Leibler divergence between the two distributions + """ + ekld = np.exp(kld(prevs, prevs_hat, eps)) + return 2. * ekld / (1 + ekld) - 1.
+ + +
[docs]def mrae(prevs, prevs_hat, eps=None): + """Computes the mean relative absolute error (see :meth:`quapy.error.rae`) across + the sample pairs. The distributions are smoothed using the `eps` factor (see + :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :param eps: smoothing factor. `mrae` is not defined in cases in which the true + distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, + with :math:`T` the sample size. If `eps=None`, the sample size will be taken from + the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand). + :return: mean relative absolute error + """ + return rae(prevs, prevs_hat, eps).mean()
+ + +
[docs]def rae(prevs, prevs_hat, eps=None): + """Computes the absolute relative error between the two prevalence vectors. + Relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` + is computed as + :math:`RAE(p,\\hat{p})= + \\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}\\frac{|\\hat{p}(y)-p(y)|}{p(y)}`, + where :math:`\\mathcal{Y}` are the classes of interest. + The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :param eps: smoothing factor. `rae` is not defined in cases in which the true distribution + contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the + sample size. If `eps=None`, the sample size will be taken from the environment variable + `SAMPLE_SIZE` (which has thus to be set beforehand). + :return: relative absolute error + """ + eps = __check_eps(eps) + prevs = smooth(prevs, eps) + prevs_hat = smooth(prevs_hat, eps) + return (abs(prevs - prevs_hat) / prevs).mean(axis=-1)
+ + +
[docs]def nrae(prevs, prevs_hat, eps=None): + """Computes the normalized absolute relative error between the two prevalence vectors. + Relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` + is computed as + :math:`NRAE(p,\\hat{p})= \\frac{RAE(p,\\hat{p})}{z_{RAE}}`, + where + :math:`z_{RAE} = \\frac{|\\mathcal{Y}|-1+\\frac{1-\\min_{y\\in \\mathcal{Y}} p(y)}{\\min_{y\\in \\mathcal{Y}} p(y)}}{|\\mathcal{Y}|}` + and :math:`\\mathcal{Y}` are the classes of interest. + The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values + :param eps: smoothing factor. `nrae` is not defined in cases in which the true distribution + contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the + sample size. If `eps=None`, the sample size will be taken from the environment variable + `SAMPLE_SIZE` (which has thus to be set beforehand). + :return: normalized relative absolute error + """ + eps = __check_eps(eps) + prevs = smooth(prevs, eps) + prevs_hat = smooth(prevs_hat, eps) + min_p = prevs.min(axis=-1) + return (abs(prevs - prevs_hat) / prevs).sum(axis=-1)/(prevs.shape[-1]-1+(1-min_p)/min_p)
+ + +
[docs]def mnrae(prevs, prevs_hat, eps=None): + """Computes the mean normalized relative absolute error (see :meth:`quapy.error.nrae`) across + the sample pairs. The distributions are smoothed using the `eps` factor (see + :meth:`quapy.error.smooth`). + + :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + prevalence values + :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted + prevalence values + :param eps: smoothing factor. `mnrae` is not defined in cases in which the true + distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, + with :math:`T` the sample size. If `eps=None`, the sample size will be taken from + the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand). + :return: mean normalized relative absolute error + """ + return nrae(prevs, prevs_hat, eps).mean()
+ + +
[docs]def smooth(prevs, eps): + """ Smooths a prevalence distribution with :math:`\\epsilon` (`eps`) as: + :math:`\\underline{p}(y)=\\frac{\\epsilon+p(y)}{\\epsilon|\\mathcal{Y}|+ + \\displaystyle\\sum_{y\\in \\mathcal{Y}}p(y)}` + + :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param eps: smoothing factor + :return: array-like of shape `(n_classes,)` with the smoothed distribution + """ + n_classes = prevs.shape[-1] + return (prevs + eps) / (eps * n_classes + 1)
+ + +def __check_eps(eps=None): + if eps is None: + sample_size = qp.environ['SAMPLE_SIZE'] + if sample_size is None: + raise ValueError('eps was not defined, and qp.environ["SAMPLE_SIZE"] was not set') + eps = 1. / (2. * sample_size) + return eps + + +CLASSIFICATION_ERROR = {f1e, acce} +QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld} +QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld} +QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae} +CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR} +QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR} +QUANTIFICATION_ERROR_SINGLE_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR_SINGLE} +QUANTIFICATION_ERROR_SMOOTH_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR_SMOOTH} +ERROR_NAMES = \ + CLASSIFICATION_ERROR_NAMES | QUANTIFICATION_ERROR_NAMES | QUANTIFICATION_ERROR_SINGLE_NAMES + +f1_error = f1e +acc_error = acce +mean_absolute_error = mae +absolute_error = ae +mean_relative_absolute_error = mrae +relative_absolute_error = rae +normalized_absolute_error = nae +normalized_relative_absolute_error = nrae +mean_normalized_absolute_error = mnae +mean_normalized_relative_absolute_error = mnrae +
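As a quick orientation on how these measures are meant to be used, a small sketch on hand-crafted prevalence vectors (the numbers are arbitrary):

```python
import numpy as np
import quapy as qp

true_prev = np.asarray([0.20, 0.30, 0.50])
estim_prev = np.asarray([0.25, 0.25, 0.50])

print(qp.error.ae(true_prev, estim_prev))  # (0.05 + 0.05 + 0) / 3 = 0.0333...

# error functions can also be resolved by name
mae_fn = qp.error.from_name('mae')

# smoothed measures require eps, or qp.environ['SAMPLE_SIZE'] to be set beforehand
qp.environ['SAMPLE_SIZE'] = 100
print(qp.error.rae(true_prev, estim_prev))
```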
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/evaluation.html b/docs/build/html/_modules/quapy/evaluation.html new file mode 100644 index 0000000..56d34a5 --- /dev/null +++ b/docs/build/html/_modules/quapy/evaluation.html @@ -0,0 +1,291 @@ + + + + + + quapy.evaluation — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
Source code for quapy.evaluation
+from typing import Union, Callable, Iterable
+import numpy as np
+from tqdm import tqdm
+import quapy as qp
+from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol, IterateProtocol
+from quapy.method.base import BaseQuantifier
+import pandas as pd
+
+
+
[docs]def prediction( + model: BaseQuantifier, + protocol: AbstractProtocol, + aggr_speedup: Union[str, bool] = 'auto', + verbose=False): + """ + Uses a quantification model to generate predictions for the samples generated via a specific protocol. + This function is central to all evaluation processes, and is endowed with an optimization to speed up the + prediction of protocols that generate samples from a large collection. The optimization applies to aggregative + quantifiers only, and to OnLabelledCollectionProtocol protocols, and comes down to generating the classification + predictions once and for all, and then generating samples over the classification predictions (instead of over + the raw instances), so that the classifier prediction is never called again. This behaviour is obtained by + setting `aggr_speedup` to 'auto' or True, and is only carried out if the overall process is convenient in terms + of computations (e.g., if the number of classification predictions needed for the original collection exceeds the + number of classification predictions needed for all samples, then the optimization is not undertaken). + + :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier` + :param protocol: :class:`quapy.protocol.AbstractProtocol`; if this object is also an instance of + :class:`quapy.protocol.OnLabelledCollectionProtocol`, then the aggregation speed-up can be run. This is the protocol + in charge of generating the samples for which the model has to issue class prevalence predictions. + :param aggr_speedup: whether or not to apply the speed-up. Set to "force" for applying it even if the number of + instances in the original collection on which the protocol acts is larger than the number of instances + in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is + convenient or not. Set to False to deactivate. 
+ :param verbose: boolean, show or not information in stdout + :return: a tuple `(true_prevs, estim_prevs)` in which each element in the tuple is an array of shape + `(n_samples, n_classes)` containing the true, or predicted, prevalence values for each sample + """ + assert aggr_speedup in [False, True, 'auto', 'force'], 'invalid value for aggr_speedup' + + sout = lambda x: print(x) if verbose else None + + apply_optimization = False + + if aggr_speedup in [True, 'auto', 'force']: + # checks whether the prediction can be made more efficiently; this check consists in verifying if the model is + # of type aggregative, if the protocol is based on LabelledCollection, and if the total number of documents to + # classify using the protocol would exceed the number of test documents in the original collection + from quapy.method.aggregative import AggregativeQuantifier + if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol): + if aggr_speedup == 'force': + apply_optimization = True + sout(f'forcing aggregative speedup') + elif hasattr(protocol, 'sample_size'): + nD = len(protocol.get_labelled_collection()) + samplesD = protocol.total() * protocol.sample_size + if nD < samplesD: + apply_optimization = True + sout(f'speeding up the prediction for the aggregative quantifier, ' + f'total classifications {nD} instead of {samplesD}') + + if apply_optimization: + pre_classified = model.classify(protocol.get_labelled_collection().instances) + protocol_with_predictions = protocol.on_preclassified_instances(pre_classified) + return __prediction_helper(model.aggregate, protocol_with_predictions, verbose) + else: + return __prediction_helper(model.quantify, protocol, verbose)
+ + +def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False): + true_prevs, estim_prevs = [], [] + for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total(), desc='predicting') if verbose else protocol(): + estim_prevs.append(quantification_fn(sample_instances)) + true_prevs.append(sample_prev) + + true_prevs = np.asarray(true_prevs) + estim_prevs = np.asarray(estim_prevs) + + return true_prevs, estim_prevs + + +
[docs]def evaluation_report(model: BaseQuantifier, + protocol: AbstractProtocol, + error_metrics: Iterable[Union[str,Callable]] = 'mae', + aggr_speedup: Union[str, bool] = 'auto', + verbose=False): + """ + Generates a report (a pandas' DataFrame) containing information of the evaluation of the model as according + to a specific protocol and in terms of one or more evaluation metrics (errors). + + + :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier` + :param protocol: :class:`quapy.protocol.AbstractProtocol`; if this object is also instance of + :class:`quapy.protocol.OnLabelledCollectionProtocol`, then the aggregation speed-up can be run. This is the protocol + in charge of generating the samples in which the model is evaluated. + :param error_metrics: a string, or list of strings, representing the name(s) of an error function in `qp.error` + (e.g., 'mae', the default value), or a callable function, or a list of callable functions, implementing + the error function itself. + :param aggr_speedup: whether or not to apply the speed-up. Set to "force" for applying it even if the number of + instances in the original collection on which the protocol acts is larger than the number of instances + in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is + convenient or not. Set to False to deactivate. + :param verbose: boolean, show or not information in stdout + :return: a pandas' DataFrame containing the columns 'true-prev' (the true prevalence of each sample), + 'estim-prev' (the prevalence estimated by the model for each sample), and as many columns as error metrics + have been indicated, each displaying the score in terms of that metric for every sample. + """ + + true_prevs, estim_prevs = prediction(model, protocol, aggr_speedup=aggr_speedup, verbose=verbose) + return _prevalence_report(true_prevs, estim_prevs, error_metrics)
+ + +def _prevalence_report(true_prevs, estim_prevs, error_metrics: Iterable[Union[str, Callable]] = 'mae'): + + if isinstance(error_metrics, str): + error_metrics = [error_metrics] + + error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics] + assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions' + error_names = [e.__name__ for e in error_funcs] + + row_entries = [] + for true_prev, estim_prev in zip(true_prevs, estim_prevs): + series = {'true-prev': true_prev, 'estim-prev': estim_prev} + for error_name, error_metric in zip(error_names, error_funcs): + score = error_metric(true_prev, estim_prev) + series[error_name] = score + row_entries.append(series) + + df = pd.DataFrame.from_records(row_entries) + return df + + +
[docs]def evaluate( + model: BaseQuantifier, + protocol: AbstractProtocol, + error_metric: Union[str, Callable], + aggr_speedup: Union[str, bool] = 'auto', + verbose=False): + """ + Evaluates a quantification model according to a specific sample generation protocol and in terms of one + evaluation metric (error). + + :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier` + :param protocol: :class:`quapy.protocol.AbstractProtocol`; if this object is also instance of + :class:`quapy.protocol.OnLabelledCollectionProtocol`, then the aggregation speed-up can be run. This is the + protocol in charge of generating the samples in which the model is evaluated. + :param error_metric: a string representing the name(s) of an error function in `qp.error` + (e.g., 'mae'), or a callable function implementing the error function itself. + :param aggr_speedup: whether or not to apply the speed-up. Set to "force" for applying it even if the number of + instances in the original collection on which the protocol acts is larger than the number of instances + in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is + convenient or not. Set to False to deactivate. + :param verbose: boolean, show or not information in stdout + :return: if the error metric is not averaged (e.g., 'ae', 'rae'), returns an array of shape `(n_samples,)` with + the error scores for each sample; if the error metric is averaged (e.g., 'mae', 'mrae') then returns + a single float + """ + + if isinstance(error_metric, str): + error_metric = qp.error.from_name(error_metric) + true_prevs, estim_prevs = prediction(model, protocol, aggr_speedup=aggr_speedup, verbose=verbose) + return error_metric(true_prevs, estim_prevs)
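A minimal sketch of the evaluation entry point; `model` is assumed to be an already-trained quantifier and `test` a `LabelledCollection`, both hypothetical names:

```python
import quapy as qp
from quapy.protocol import APP

# evaluate a trained quantifier under the artificial prevalence protocol
qp.environ['SAMPLE_SIZE'] = 100
mae = qp.evaluation.evaluate(model, protocol=APP(test), error_metric='mae')
print(f'MAE = {mae:.4f}')
```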
+ + +
[docs]def evaluate_on_samples( + model: BaseQuantifier, + samples: Iterable[qp.data.LabelledCollection], + error_metric: Union[str, Callable], + verbose=False): + """ + Evaluates a quantification model on a given set of samples and in terms of one evaluation metric (error). + + :param model: a quantifier, instance of :class:`quapy.method.base.BaseQuantifier` + :param samples: a list of samples on which the quantifier is to be evaluated + :param error_metric: a string representing the name(s) of an error function in `qp.error` + (e.g., 'mae'), or a callable function implementing the error function itself. + :param verbose: boolean, show or not information in stdout + :return: if the error metric is not averaged (e.g., 'ae', 'rae'), returns an array of shape `(n_samples,)` with + the error scores for each sample; if the error metric is averaged (e.g., 'mae', 'mrae') then returns + a single float + """ + + return evaluate(model, IterateProtocol(samples), error_metric, aggr_speedup=False, verbose=verbose)
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/functional.html b/docs/build/html/_modules/quapy/functional.html new file mode 100644 index 0000000..1b02248 --- /dev/null +++ b/docs/build/html/_modules/quapy/functional.html @@ -0,0 +1,468 @@ + + + + + + quapy.functional — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
Source code for quapy.functional
+import itertools
+from collections import defaultdict
+from typing import Union, Callable
+
+import scipy
+import numpy as np
+
+
+
[docs]def prevalence_linspace(n_prevalences=21, repeats=1, smooth_limits_epsilon=0.01): + """ + Produces an array of uniformly separated values of prevalence. + By default, produces an array of 21 prevalence values, with + step 0.05 and with the limits smoothed, i.e.: + [0.01, 0.05, 0.10, 0.15, ..., 0.90, 0.95, 0.99] + + :param n_prevalences: the number of prevalence values to sample from the [0,1] interval (default 21) + :param repeats: number of times each prevalence is to be repeated (defaults to 1) + :param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1 + :return: an array of uniformly separated prevalence values + """ + p = np.linspace(0., 1., num=n_prevalences, endpoint=True) + p[0] += smooth_limits_epsilon + p[-1] -= smooth_limits_epsilon + if p[0] > p[1]: + raise ValueError(f'the smoothing in the limits is greater than the prevalence step') + if repeats > 1: + p = np.repeat(p, repeats) + return p
+ + +
[docs]def prevalence_from_labels(labels, classes): + """ + Computes the prevalence values from a vector of labels. + + :param labels: array-like of shape `(n_instances)` with the label for each instance + :param classes: the class labels. This is needed in order to correctly compute the prevalence vector even when + some classes have no examples. + :return: an ndarray of shape `(len(classes))` with the class prevalence values + """ + if labels.ndim != 1: + raise ValueError(f'param labels does not seem to be a ndarray of label predictions') + unique, counts = np.unique(labels, return_counts=True) + by_class = defaultdict(lambda:0, dict(zip(unique, counts))) + prevalences = np.asarray([by_class[class_] for class_ in classes], dtype=float) + prevalences /= prevalences.sum() + return prevalences
+ + +
[docs]def prevalence_from_probabilities(posteriors, binarize: bool = False): + """ + Returns a vector of prevalence values from a matrix of posterior probabilities. + + :param posteriors: array-like of shape `(n_instances, n_classes,)` with posterior probabilities for each class + :param binarize: set to True (default is False) for computing the prevalence values on crisp decisions (i.e., + converting the vectors of posterior probabilities into class indices, by taking the argmax). + :return: array of shape `(n_classes,)` containing the prevalence values + """ + if posteriors.ndim != 2: + raise ValueError(f'param posteriors does not seem to be a ndarray of posterior probabilities') + if binarize: + predictions = np.argmax(posteriors, axis=-1) + return prevalence_from_labels(predictions, np.arange(posteriors.shape[1])) + else: + prevalences = posteriors.mean(axis=0) + prevalences /= prevalences.sum() + return prevalences
+ + +
[docs]def as_binary_prevalence(positive_prevalence: Union[float, np.ndarray], clip_if_necessary=False): + """ + Helper that, given a float representing the prevalence for the positive class, returns a np.ndarray of two + values representing a binary distribution. + + :param positive_prevalence: prevalence for the positive class + :param clip_if_necessary: if True, clips the value in [0,1] in order to guarantee the resulting distribution + is valid. If False, it then checks that the value is in the valid range, and raises an error if not. + :return: np.ndarray of shape `(2,)` + """ + if clip_if_necessary: + positive_prevalence = np.clip(positive_prevalence, 0, 1) + else: + assert 0 <= positive_prevalence <= 1, 'the value provided is not a valid prevalence for the positive class' + return np.asarray([1-positive_prevalence, positive_prevalence]).T
+ + + +
[docs]def HellingerDistance(P, Q) -> float: + """ + Computes the Hellinger Distance (HD) between (discretized) distributions `P` and `Q`. + The HD for two discrete distributions of `k` bins is defined as: + + .. math:: + HD(P,Q) = \\frac{ 1 }{ \\sqrt{ 2 } } \\sqrt{ \\sum_{i=1}^k ( \\sqrt{p_i} - \\sqrt{q_i} )^2 } + + :param P: real-valued array-like of shape `(k,)` representing a discrete distribution + :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution + :return: float + """ + return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
+ + +
[docs]def TopsoeDistance(P, Q, epsilon=1e-20): + """ + Topsoe distance between two (discretized) distributions `P` and `Q`. + The Topsoe distance for two discrete distributions of `k` bins is defined as: + + .. math:: + Topsoe(P,Q) = \\sum_{i=1}^k \\left( p_i \\log\\left(\\frac{ 2 p_i + \\epsilon }{ p_i+q_i+\\epsilon }\\right) + + q_i \\log\\left(\\frac{ 2 q_i + \\epsilon }{ p_i+q_i+\\epsilon }\\right) \\right) + + :param P: real-valued array-like of shape `(k,)` representing a discrete distribution + :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution + :param epsilon: small smoothing constant added to the bins to avoid taking the logarithm of zero (default 1e-20) + :return: float + """ + return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) + Q*np.log((2*Q+epsilon)/(P+Q+epsilon)))
+ + +
[docs]def uniform_prevalence_sampling(n_classes, size=1): + """ + Implements the `Kraemer algorithm <http://www.cs.cmu.edu/~nasmith/papers/smith+tromble.tr04.pdf>`_ + for sampling uniformly at random from the unit simplex. This implementation is adapted from this + `post <https://cs.stackexchange.com/questions/3227/uniform-sampling-from-a-simplex>_`. + + :param n_classes: integer, number of classes (dimensionality of the simplex) + :param size: number of samples to return + :return: `np.ndarray` of shape `(size, n_classes,)` if `size>1`, or of shape `(n_classes,)` otherwise + """ + if n_classes == 2: + u = np.random.rand(size) + u = np.vstack([1-u, u]).T + else: + u = np.random.rand(size, n_classes-1) + u.sort(axis=-1) + _0s = np.zeros(shape=(size, 1)) + _1s = np.ones(shape=(size, 1)) + a = np.hstack([_0s, u]) + b = np.hstack([u, _1s]) + u = b-a + if size == 1: + u = u.flatten() + return u
+ + +uniform_simplex_sampling = uniform_prevalence_sampling + + +
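A small sketch of the Kraemer sampling routine defined above; by construction, every returned vector lies on the unit simplex:

```python
from quapy.functional import uniform_prevalence_sampling

# draw 5 prevalence vectors uniformly at random from the 3-class simplex
prevs = uniform_prevalence_sampling(n_classes=3, size=5)
print(prevs.shape)        # (5, 3)
print(prevs.sum(axis=1))  # every row sums to 1
```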
[docs]def strprev(prevalences, prec=3): + """ + Returns a string representation for a prevalence vector. E.g., + + >>> strprev([1/3, 2/3], prec=2) + >>> '[0.33, 0.67]' + + :param prevalences: a vector of prevalence values + :param prec: float precision + :return: string + """ + return '['+ ', '.join([f'{p:.{prec}f}' for p in prevalences]) + ']'
+ + +
[docs]def adjusted_quantification(prevalence_estim, tpr, fpr, clip=True): + """ + Implements the adjustment of ACC and PACC for the binary case. The adjustment for a prevalence estimate of the + positive class `p` comes down to computing: + + .. math:: + ACC(p) = \\frac{ p - fpr }{ tpr - fpr } + + :param prevalence_estim: float, the estimated value for the positive class + :param tpr: float, the true positive rate of the classifier + :param fpr: float, the false positive rate of the classifier + :param clip: set to True (default) to clip values that might exceed the range [0,1] + :return: float, the adjusted count + """ + + den = tpr - fpr + if den == 0: + den += 1e-8 + adjusted = (prevalence_estim - fpr) / den + if clip: + adjusted = np.clip(adjusted, 0., 1.) + return adjusted
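A worked instance of the adjustment formula above:

```python
from quapy.functional import adjusted_quantification

# with tpr=0.8 and fpr=0.2, an observed positive rate of 0.6 is corrected to
# (0.6 - 0.2) / (0.8 - 0.2) = 0.666...
print(adjusted_quantification(0.6, tpr=0.8, fpr=0.2))
```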
+ + +
[docs]def normalize_prevalence(prevalences): + """ + Normalize a vector or matrix of prevalence values. The normalization consists of applying a L1 normalization in + cases in which the prevalence values are not all-zeros, and to convert the prevalence values into `1/n_classes` in + cases in which all values are zero. + + :param prevalences: array-like of shape `(n_classes,)` or of shape `(n_samples, n_classes,)` with prevalence values + :return: a normalized vector or matrix of prevalence values + """ + prevalences = np.asarray(prevalences) + n_classes = prevalences.shape[-1] + accum = prevalences.sum(axis=-1, keepdims=True) + prevalences = np.true_divide(prevalences, accum, where=accum>0) + allzeros = accum.flatten()==0 + if any(allzeros): + if prevalences.ndim == 1: + prevalences = np.full(shape=n_classes, fill_value=1./n_classes) + else: + prevalences[accum.flatten()==0] = np.full(shape=n_classes, fill_value=1./n_classes) + return prevalences
+ + +def __num_prevalence_combinations_depr(n_prevpoints:int, n_classes:int, n_repeats:int=1): + """ + Computes the number of prevalence combinations in the n_classes-dimensional simplex if `nprevpoints` equally distant + prevalence values are generated and `n_repeats` repetitions are requested. + + :param n_classes: integer, number of classes + :param n_prevpoints: integer, number of prevalence points. + :param n_repeats: integer, number of repetitions for each prevalence combination + :return: The number of possible combinations. For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the + number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0] + """ + __cache={} + def __f(nc,np): + if (nc,np) in __cache: # cached result + return __cache[(nc,np)] + if nc==1: # stop condition + return 1 + else: # recursive call + x = sum([__f(nc-1, np-i) for i in range(np)]) + __cache[(nc,np)] = x + return x + return __f(n_classes, n_prevpoints) * n_repeats + + +
[docs]def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1): + """ + Computes the number of valid prevalence combinations in the n_classes-dimensional simplex if `n_prevpoints` equally + distant prevalence values are generated and `n_repeats` repetitions are requested. + The computation comes down to calculating: + + .. math:: + \\binom{N+C-1}{C-1} \\times r + + where `N` is `n_prevpoints-1`, i.e., the number of probability mass blocks to allocate, `C` is the number of + classes, and `r` is `n_repeats`. This solution comes from the + `Stars and Bars <https://brilliant.org/wiki/integer-equations-star-and-bars/>`_ problem. + + :param n_classes: integer, number of classes + :param n_prevpoints: integer, number of prevalence points. + :param n_repeats: integer, number of repetitions for each prevalence combination + :return: The number of possible combinations. For example, if n_classes=2, n_prevpoints=5, n_repeats=1, then the + number of possible combinations are 5, i.e.: [0,1], [0.25,0.75], [0.50,0.50], [0.75,0.25], and [1.0,0.0] + """ + N = n_prevpoints-1 + C = n_classes + r = n_repeats + return int(scipy.special.binom(N + C - 1, C - 1) * r)
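For instance, 5 equidistant prevalence points in a 3-class simplex give N=4, C=3, and hence binom(6, 2) = 15 combinations; a quick check:

```python
from quapy.functional import num_prevalence_combinations

# counts the valid prevalence combinations via the stars-and-bars formula
print(num_prevalence_combinations(n_prevpoints=5, n_classes=3, n_repeats=1))  # 15
```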
+ + +
[docs]def get_nprevpoints_approximation(combinations_budget:int, n_classes:int, n_repeats:int=1): + """ + Searches for the largest number of (equidistant) prevalence points to define for each of the `n_classes` classes so + that the number of valid prevalence values generated as combinations of prevalence points (points in a + `n_classes`-dimensional simplex) does not exceed `combinations_budget`. + + :param combinations_budget: integer, maximum number of combinations allowed + :param n_classes: integer, number of classes + :param n_repeats: integer, number of repetitions for each prevalence combination + :return: the largest number of prevalence points that generates no more than `combinations_budget` valid prevalence combinations + """ + assert n_classes > 0 and n_repeats > 0 and combinations_budget > 0, 'parameters must be positive integers' + n_prevpoints = 1 + while True: + combinations = num_prevalence_combinations(n_prevpoints, n_classes, n_repeats) + if combinations > combinations_budget: + return n_prevpoints-1 + else: + n_prevpoints += 1
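For example (a sketch): with 3 classes and a budget of 100, the search should stop at 13 prevalence points, since 13 points generate 13*14/2 = 91 <= 100 combinations whereas 14 points would generate 105:
>>> import quapy.functional as F
>>> F.get_nprevpoints_approximation(combinations_budget=100, n_classes=3)  # 13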
+ + +
[docs]def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08): + """ + Checks that p is a valid prevalence vector, i.e., that it contains values in [0,1] and that the values sum up to 1. + + :param p: the prevalence vector to check + :param raise_exception: if True, raises a ValueError describing the problem instead of returning False (default is False) + :param toleranze: numerical tolerance used in the check that the values sum up to 1 + :return: True if `p` is valid, False otherwise + """ + p = np.asarray(p) + if not all(p>=0): + if raise_exception: + raise ValueError('the prevalence vector contains negative numbers') + return False + if not all(p<=1): + if raise_exception: + raise ValueError('the prevalence vector contains values >1') + return False + if not np.isclose(p.sum(), 1, atol=toleranze): + if raise_exception: + raise ValueError('the prevalence vector does not sum up to 1') + return False + return True
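For instance (a sketch):
>>> import quapy.functional as F
>>> F.check_prevalence_vector([0.5, 0.3, 0.2])  # True
>>> F.check_prevalence_vector([0.5, 0.3])       # False (does not sum up to 1)
>>> F.check_prevalence_vector([1.2, -0.2], raise_exception=True)  # raises ValueError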
+ + +
[docs]def get_divergence(divergence: Union[str, Callable]): + if isinstance(divergence, str): + if divergence=='HD': + return HellingerDistance + elif divergence=='topsoe': + return TopsoeDistance + else: + raise ValueError(f'unknown divergence {divergence}') + elif callable(divergence): + return divergence + else: + raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
+ + +
[docs]def argmin_prevalence(loss, n_classes, method='optim_minimize'): + if method == 'optim_minimize': + return optim_minimize(loss, n_classes) + elif method == 'linear_search': + return linear_search(loss, n_classes) + elif method == 'ternary_search': + raise NotImplementedError() + else: + raise NotImplementedError()
+ + +
[docs]def optim_minimize(loss, n_classes): + """ + Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex + that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's + SLSQP routine. + + :param loss: (callable) the function to minimize + :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector + :return: (ndarray) the best prevalence vector found + """ + from scipy import optimize + + # the initial point is set as the uniform distribution + uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + + # solutions are bounded to those contained in the unit-simplex + bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] + constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 + r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) + return r.x
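A hedged sketch: minimizing the squared error towards a known target prevalence should approximately recover that target:
>>> import numpy as np
>>> import quapy.functional as F
>>> target = np.asarray([0.2, 0.3, 0.5])
>>> F.optim_minimize(lambda prev: np.sum((prev - target) ** 2), n_classes=3)  # ≈ array([0.2, 0.3, 0.5])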
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/method/_kdey.html b/docs/build/html/_modules/quapy/method/_kdey.html new file mode 100644 index 0000000..4e96e56 --- /dev/null +++ b/docs/build/html/_modules/quapy/method/_kdey.html @@ -0,0 +1,462 @@ + + + + + + quapy.method._kdey — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +

Source code for quapy.method._kdey

+from typing import Union
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.neighbors import KernelDensity
+
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import AggregativeSoftQuantifier
+import quapy.functional as F
+
+from sklearn.metrics.pairwise import rbf_kernel
+
+
+
[docs]class KDEBase: + """ + Common ancestor for KDE-based methods. Implements some common routines. + """ + + BANDWIDTH_METHOD = ['scott', 'silverman'] + + @classmethod + def _check_bandwidth(cls, bandwidth): + """ + Checks that the bandwidth parameter is correct + + :param bandwidth: either a string (see BANDWIDTH_METHOD) or a float + :return: nothing, but raises an exception for invalid values + """ + assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \ + f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values' + if isinstance(bandwidth, float): + assert 0 < bandwidth < 1, "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex" + +
[docs] def get_kde_function(self, X, bandwidth): + """ + Wraps the KDE function from scikit-learn. + + :param X: data for which the density function is to be estimated + :param bandwidth: the bandwidth of the kernel + :return: a scikit-learn's KernelDensity object + """ + return KernelDensity(bandwidth=bandwidth).fit(X)
+ +
[docs] def pdf(self, kde, X): + """ + Wraps the density evaluation of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this + function returns :math:`e^{s}` + + :param kde: a previously fit KDE function + :param X: the data for which the density is to be estimated + :return: np.ndarray with the densities + """ + return np.exp(kde.score_samples(X))
+ +
[docs] def get_mixture_components(self, X, y, n_classes, bandwidth): + """ + Returns an array containing the mixture components, i.e., the KDE functions for each class. + + :param X: the data containing the covariates + :param y: the class labels + :param n_classes: integer, the number of classes + :param bandwidth: float, the bandwidth of the kernel + :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates + """ + return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
+ + + +
[docs]class KDEyML(AggregativeSoftQuantifier, KDEBase): + """ + Kernel Density Estimation model for quantification (KDEy) relying on the Kullback-Leibler divergence (KLD) as + the divergence measure to be minimized. This method was first proposed in the paper + `Kernel Density Estimation for Multiclass Quantification <https://arxiv.org/abs/2401.00490>`_, in which + the authors show that minimizing the distribution matching criterion for KLD is akin to performing + maximum likelihood (ML). + + The distribution matching optimization problem comes down to solving: + + :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})` + + where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence) + :math:`\\alpha` defined by + + :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})` + + where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the + KDE function that uses the datapoints in X as the kernel centers. + + In KDEy-ML, the divergence is taken to be the Kullback-Leibler Divergence. This is equivalent to solving: + :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} - + \\mathbb{E}_{q_{\\widetilde{U}}} \\left[ \\log \\boldsymbol{p}_{\\alpha}(\\widetilde{x}) \\right]` + + which corresponds to the maximum likelihood estimate. + + :param classifier: a sklearn's Estimator that generates a binary classifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a collection defining the specific set of data to use for validation. + Alternatively, this set can be specified at fit time by indicating the exact set of data + on which the predictions are to be generated. + :param bandwidth: float, the bandwidth of the Kernel + :param n_jobs: number of parallel workers + :param random_state: a seed to be set before fitting any base quantifier (default None) + """ + + def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=None): + self._check_bandwidth(bandwidth) + self.classifier = classifier + self.val_split = val_split + self.bandwidth = bandwidth + self.n_jobs = n_jobs + self.random_state=random_state + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth) + return self
+ +
[docs] def aggregate(self, posteriors: np.ndarray): + """ + Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood + of the data (i.e., that minimizes the negative log-likelihood) + + :param posteriors: instances in the sample converted into posterior probabilities + :return: a vector of class prevalence estimates + """ + np.random.RandomState(self.random_state) + epsilon = 1e-10 + n_classes = len(self.mix_densities) + test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities] + + def neg_loglikelihood(prev): + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) + + return F.optim_minimize(neg_loglikelihood, n_classes)
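A hedged end-to-end sketch using the fit/quantify interface of this version of the module (`training` and `test` stand for any LabelledCollection pair, e.g., as returned by one of the dataset fetchers; they are placeholders, not defined here):
>>> from sklearn.linear_model import LogisticRegression
>>> from quapy.method._kdey import KDEyML
>>> quantifier = KDEyML(LogisticRegression(), bandwidth=0.1)
>>> quantifier.fit(training)                          # training: a LabelledCollection
>>> prev_estim = quantifier.quantify(test.instances)  # vector of class prevalence estimates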
+ + +
[docs]class KDEyHD(AggregativeSoftQuantifier, KDEBase): + """ + Kernel Density Estimation model for quantification (KDEy) relying on the squared Hellinger Distance (HD) as + the divergence measure to be minimized. This method was first proposed in the paper + `Kernel Density Estimation for Multiclass Quantification <https://arxiv.org/abs/2401.00490>`_, in which + the authors proposed a Monte Carlo approach for minimizing the divergence. + + The distribution matching optimization problem comes down to solving: + + :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})` + + where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence) + :math:`\\alpha` defined by + + :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})` + + where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the + KDE function that uses the datapoints in X as the kernel centers. + + In KDEy-HD, the divergence is taken to be the squared Hellinger Distance, an f-divergence with corresponding + f-generator function given by: + + :math:`f(u)=(\\sqrt{u}-1)^2` + + The authors proposed a Monte Carlo solution that relies on importance sampling: + + :math:`\\hat{D}_f(p||q)= \\frac{1}{t} \\sum_{i=1}^t f\\left(\\frac{p(x_i)}{q(x_i)}\\right) \\frac{q(x_i)}{r(x_i)}` + + where the datapoints (trials) :math:`x_1,\\ldots,x_t\\sim_{\\mathrm{iid}} r` with :math:`r` the + uniform distribution. + + :param classifier: a sklearn's Estimator that generates a binary classifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a collection defining the specific set of data to use for validation. + Alternatively, this set can be specified at fit time by indicating the exact set of data + on which the predictions are to be generated. + :param divergence: a string identifying the divergence to minimize (currently, only 'HD' is supported) + :param bandwidth: float, the bandwidth of the Kernel + :param n_jobs: number of parallel workers + :param random_state: a seed to be set before fitting any base quantifier (default None) + :param montecarlo_trials: number of Monte Carlo trials (default 10000) + """ + + def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD', + bandwidth=0.1, n_jobs=None, random_state=None, montecarlo_trials=10000): + + self._check_bandwidth(bandwidth) + self.classifier = classifier + self.val_split = val_split + self.divergence = divergence + self.bandwidth = bandwidth + self.n_jobs = n_jobs + self.random_state=random_state + self.montecarlo_trials = montecarlo_trials + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth) + + N = self.montecarlo_trials + rs = self.random_state + n = data.n_classes + self.reference_samples = np.vstack([kde_i.sample(N//n, random_state=rs) for kde_i in self.mix_densities]) + self.reference_classwise_densities = np.asarray([self.pdf(kde_j, self.reference_samples) for kde_j in self.mix_densities]) + self.reference_density = np.mean(self.reference_classwise_densities, axis=0) # equiv. to (uniform @ self.reference_classwise_densities) + + return self
+ +
[docs] def aggregate(self, posteriors: np.ndarray): + # we retain all n*N examples (sampled from a mixture with uniform parameter), and then + # apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS + n_classes = len(self.mix_densities) + + test_kde = self.get_kde_function(posteriors, self.bandwidth) + test_densities = self.pdf(test_kde, self.reference_samples) + + def f_squared_hellinger(u): + return (np.sqrt(u)-1)**2 + + # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway + if self.divergence.lower() == 'hd': + f = f_squared_hellinger + else: + raise ValueError('only squared HD is currently implemented') + + epsilon = 1e-10 + qs = test_densities + epsilon + rs = self.reference_density + epsilon + iw = qs/rs #importance weights + p_class = self.reference_classwise_densities + epsilon + fracs = p_class/qs + + def divergence(prev): + # ps / qs = (prev @ p_class) / qs = prev @ (p_class / qs) = prev @ fracs + ps_div_qs = prev @ fracs + return np.mean( f(ps_div_qs) * iw ) + + return F.optim_minimize(divergence, n_classes)
+ + +
[docs]class KDEyCS(AggregativeSoftQuantifier): + """ + Kernel Density Estimation model for quantification (KDEy) relying on the Cauchy-Schwarz divergence (CS) as + the divergence measure to be minimized. This method was first proposed in the paper + `Kernel Density Estimation for Multiclass Quantification <https://arxiv.org/abs/2401.00490>`_, in which + the authors proposed a Monte Carlo approach for minimizing the divergence. + + The distribution matching optimization problem comes down to solving: + + :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})` + + where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence) + :math:`\\alpha` defined by + + :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})` + + where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the + KDE function that uses the datapoints in X as the kernel centers. + + In KDEy-CS, the divergence is taken to be the Cauchy-Schwarz divergence given by: + + :math:`\\mathcal{D}_{\\mathrm{CS}}(p||q)=-\\log\\left(\\frac{\\int p(x)q(x)dx}{\\sqrt{\\int p(x)^2dx \\int q(x)^2dx}}\\right)` + + The authors showed that this distribution matching admits a closed-form solution + + :param classifier: a sklearn's Estimator that generates a binary classifier. + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a collection defining the specific set of data to use for validation. + Alternatively, this set can be specified at fit time by indicating the exact set of data + on which the predictions are to be generated. + :param bandwidth: float, the bandwidth of the Kernel + :param n_jobs: number of parallel workers + """ + + def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None): + KDEBase._check_bandwidth(bandwidth) + self.classifier = classifier + self.val_split = val_split + self.bandwidth = bandwidth + self.n_jobs = n_jobs + +
[docs] def gram_matrix_mix_sum(self, X, Y=None): + # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y)) + # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are + # two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth) + h = self.bandwidth + variance = 2 * (h**2) + nD = X.shape[1] + gamma = 1/(2*variance) + norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD))) + gram = norm_factor * rbf_kernel(X, Y, gamma=gamma) + return gram.sum()
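The rescaling above can be sanity-checked against scipy's Gaussian density (a sketch; `h`, `X` and `Y` are arbitrary illustrative values):
>>> import numpy as np
>>> from scipy.stats import multivariate_normal
>>> from sklearn.metrics.pairwise import rbf_kernel
>>> h, D = 0.1, 3
>>> X, Y = np.random.rand(5, D), np.random.rand(4, D)
>>> var = 2 * h ** 2
>>> gram_sum = (rbf_kernel(X, Y, gamma=1/(2*var)) / np.sqrt((2*np.pi*var) ** D)).sum()
>>> direct = sum(multivariate_normal(mean=y, cov=var*np.eye(D)).pdf(X).sum() for y in Y)
>>> np.isclose(gram_sum, direct)  # True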
+ +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + + P, y = classif_predictions.Xy + n = data.n_classes + + assert all(sorted(np.unique(y)) == np.arange(n)), \ + 'label name gaps not allowed in current implementation' + + # counts_inv keeps track of the relative weight of each datapoint within its class + # (i.e., the weight in its KDE model) + counts_inv = 1 / (data.counts()) + + # tr_tr_sums corresponds to symbol \overline{B} in the paper + tr_tr_sums = np.zeros(shape=(n,n), dtype=float) + for i in range(n): + for j in range(n): + if i > j: + tr_tr_sums[i,j] = tr_tr_sums[j,i] + else: + block = self.gram_matrix_mix_sum(P[y == i], P[y == j] if i!=j else None) + tr_tr_sums[i, j] = block + + # keep track of these data structures for the test phase + self.Ptr = P + self.ytr = y + self.tr_tr_sums = tr_tr_sums + self.counts_inv = counts_inv + + return self
+ + +
[docs] def aggregate(self, posteriors: np.ndarray): + Ptr = self.Ptr + Pte = posteriors + y = self.ytr + tr_tr_sums = self.tr_tr_sums + + M, nD = Pte.shape + Minv = (1/M) # t in the paper + n = Ptr.shape[1] + + # becomes a constant that does not affect the optimization, no need to compute it + # partC = 0.5*np.log(self.gram_matrix_mix_sum(Pte) * Kinv * Kinv) + + # tr_te_sums corresponds to \overline{a}*(1/Li)*(1/M) in the paper (note the constants + # are already aggregated to tr_te_sums, so these multiplications are not carried out + # at each iteration of the optimization phase) + tr_te_sums = np.zeros(shape=n, dtype=float) + for i in range(n): + tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte) + + def divergence(alpha): + # called \overline{r} in the paper + alpha_ratio = alpha * self.counts_inv + + # recal that tr_te_sums already accounts for the constant terms (1/Li)*(1/M) + partA = -np.log((alpha_ratio @ tr_te_sums) * Minv) + partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio) + return partA + partB #+ partC + + return F.optim_minimize(divergence, n)
+ +
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/method/_neural.html b/docs/build/html/_modules/quapy/method/_neural.html new file mode 100644 index 0000000..706a7cc --- /dev/null +++ b/docs/build/html/_modules/quapy/method/_neural.html @@ -0,0 +1,520 @@ + + + + + + quapy.method._neural — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +

Source code for quapy.method._neural

+import os
+from pathlib import Path
+import random
+
+import torch
+from torch.nn import MSELoss
+from torch.nn.functional import relu
+
+from quapy.protocol import UPP
+from quapy.method.aggregative import *
+from quapy.util import EarlyStop
+from tqdm import tqdm
+
+
+
[docs]class QuaNetTrainer(BaseQuantifier): + """ + Implementation of `QuaNet <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_, a neural network for + quantification. This implementation uses `PyTorch <https://pytorch.org/>`_ and can take advantage of GPU + for speeding-up the training phase. + + Example: + + >>> import quapy as qp + >>> from quapy.method.meta import QuaNet + >>> from quapy.classification.neural import NeuralClassifierTrainer, CNNnet + >>> + >>> # use samples of 100 elements + >>> qp.environ['SAMPLE_SIZE'] = 100 + >>> + >>> # load the kindle dataset as text, and convert words to numerical indexes + >>> dataset = qp.datasets.fetch_reviews('kindle', pickle=True) + >>> qp.train.preprocessing.index(dataset, min_df=5, inplace=True) + >>> + >>> # the text classifier is a CNN trained by NeuralClassifierTrainer + >>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes) + >>> classifier = NeuralClassifierTrainer(cnn, device='cuda') + >>> + >>> # train QuaNet (QuaNet is an alias to QuaNetTrainer) + >>> model = QuaNet(classifier, qp.environ['SAMPLE_SIZE'], device='cuda') + >>> model.fit(dataset.training) + >>> estim_prevalence = model.quantify(dataset.test.instances) + + :param classifier: an object implementing `fit` (i.e., that can be trained on labelled data), + `predict_proba` (i.e., that can generate posterior probabilities of unlabelled examples) and + `transform` (i.e., that can generate embedded representations of the unlabelled instances). + :param sample_size: integer, the sample size; default is None, meaning that the sample size should be + taken from qp.environ["SAMPLE_SIZE"] + :param n_epochs: integer, maximum number of training epochs + :param tr_iter_per_poch: integer, number of training iterations before considering an epoch complete + :param va_iter_per_poch: integer, number of validation iterations to perform after each epoch + :param lr: float, the learning rate + :param lstm_hidden_size: integer, hidden dimensionality of the LSTM cells + :param lstm_nlayers: integer, number of LSTM layers + :param ff_layers: list of integers, dimensions of the densely-connected FF layers on top of the + quantification embedding + :param bidirectional: boolean, indicates whether the LSTM is bidirectional or not + :param qdrop_p: float, dropout probability + :param patience: integer, number of epochs showing no improvement in the validation set before stopping the + training phase (early stopping) + :param checkpointdir: string, a path where to store models' checkpoints + :param checkpointname: string (optional), the name of the model's checkpoint + :param device: string, indicate "cpu" or "cuda" + """ + + def __init__(self, + classifier, + sample_size=None, + n_epochs=100, + tr_iter_per_poch=500, + va_iter_per_poch=100, + lr=1e-3, + lstm_hidden_size=64, + lstm_nlayers=1, + ff_layers=[1024, 512], + bidirectional=True, + qdrop_p=0.5, + patience=10, + checkpointdir='../checkpoint', + checkpointname=None, + device='cuda'): + + assert hasattr(classifier, 'transform'), \ + f'the classifier {classifier.__class__.__name__} does not seem to be able to produce document embeddings ' \ + f'since it does not implement the method "transform"' + assert hasattr(classifier, 'predict_proba'), \ + f'the classifier {classifier.__class__.__name__} does not seem to be able to produce posterior probabilities ' \ + f'since it does not implement the method "predict_proba"' + self.classifier = classifier + self.sample_size = qp._get_sample_size(sample_size) + self.n_epochs = n_epochs + 
self.tr_iter = tr_iter_per_poch + self.va_iter = va_iter_per_poch + self.lr = lr + self.quanet_params = { + 'lstm_hidden_size': lstm_hidden_size, + 'lstm_nlayers': lstm_nlayers, + 'ff_layers': ff_layers, + 'bidirectional': bidirectional, + 'qdrop_p': qdrop_p + } + + self.patience = patience + if checkpointname is None: + local_random = random.Random() + random_code = '-'.join(str(local_random.randint(0, 1000000)) for _ in range(5)) + checkpointname = 'QuaNet-'+random_code + self.checkpointdir = checkpointdir + self.checkpoint = os.path.join(checkpointdir, checkpointname) + self.device = torch.device(device) + + self.__check_params_colision(self.quanet_params, self.classifier.get_params()) + self._classes_ = None + +
[docs] def fit(self, data: LabelledCollection, fit_classifier=True): + """ + Trains QuaNet. + + :param data: the training data on which to train QuaNet. If `fit_classifier=True`, the data will be split in + 40/40/20 for training the classifier, training QuaNet, and validating QuaNet, respectively. If + `fit_classifier=False`, the data will be split in 66/34 for training QuaNet and validating it, respectively. + :param fit_classifier: if True, trains the classifier on a split containing 40% of the data + :return: self + """ + self._classes_ = data.classes_ + os.makedirs(self.checkpointdir, exist_ok=True) + + if fit_classifier: + classifier_data, unused_data = data.split_stratified(0.4) + train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20% + self.classifier.fit(*classifier_data.Xy) + else: + classifier_data = None + train_data, valid_data = data.split_stratified(0.66) + + # estimate the hard and soft stats tpr and fpr of the classifier + self.tr_prev = data.prevalence() + + # compute the posterior probabilities of the instances + valid_posteriors = self.classifier.predict_proba(valid_data.instances) + train_posteriors = self.classifier.predict_proba(train_data.instances) + + # turn instances' original representations into embeddings + valid_data_embed = LabelledCollection(self.classifier.transform(valid_data.instances), valid_data.labels, self._classes_) + train_data_embed = LabelledCollection(self.classifier.transform(train_data.instances), train_data.labels, self._classes_) + + self.quantifiers = { + 'cc': CC(self.classifier).fit(None, fit_classifier=False), + 'acc': ACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data), + 'pcc': PCC(self.classifier).fit(None, fit_classifier=False), + 'pacc': PACC(self.classifier).fit(None, fit_classifier=False, val_split=valid_data), + } + if classifier_data is not None: + self.quantifiers['emq'] = EMQ(self.classifier).fit(classifier_data, fit_classifier=False) + + self.status = { + 'tr-loss': -1, + 'va-loss': -1, + 'tr-mae': -1, + 'va-mae': -1, + } + + nQ = len(self.quantifiers) + nC = data.n_classes + self.quanet = QuaNetModule( + doc_embedding_size=train_data_embed.instances.shape[1], + n_classes=data.n_classes, + stats_size=nQ*nC, + order_by=0 if data.binary else None, + **self.quanet_params + ).to(self.device) + print(self.quanet) + + self.optim = torch.optim.Adam(self.quanet.parameters(), lr=self.lr) + early_stop = EarlyStop(self.patience, lower_is_better=True) + + checkpoint = self.checkpoint + + for epoch_i in range(1, self.n_epochs): + self._epoch(train_data_embed, train_posteriors, self.tr_iter, epoch_i, early_stop, train=True) + self._epoch(valid_data_embed, valid_posteriors, self.va_iter, epoch_i, early_stop, train=False) + + early_stop(self.status['va-loss'], epoch_i) + if early_stop.IMPROVED: + torch.save(self.quanet.state_dict(), checkpoint) + elif early_stop.STOP: + print(f'training ended by patience exhausted; loading best model parameters in {checkpoint} ' + f'for epoch {early_stop.best_epoch}') + self.quanet.load_state_dict(torch.load(checkpoint)) + break + + return self
+ + def _get_aggregative_estims(self, posteriors): + label_predictions = np.argmax(posteriors, axis=-1) + prevs_estim = [] + for quantifier in self.quantifiers.values(): + predictions = posteriors if isinstance(quantifier, AggregativeSoftQuantifier) else label_predictions + prevs_estim.extend(quantifier.aggregate(predictions)) + + # there is no real need for adding static estims like the TPR or FPR from training since those are constant + + return prevs_estim + +
[docs] def quantify(self, instances): + posteriors = self.classifier.predict_proba(instances) + embeddings = self.classifier.transform(instances) + quant_estims = self._get_aggregative_estims(posteriors) + self.quanet.eval() + with torch.no_grad(): + prevalence = self.quanet.forward(embeddings, posteriors, quant_estims) + if self.device == torch.device('cuda'): + prevalence = prevalence.cpu() + prevalence = prevalence.numpy().flatten() + return prevalence
+ + def _epoch(self, data: LabelledCollection, posteriors, iterations, epoch, early_stop, train): + mse_loss = MSELoss() + + self.quanet.train(mode=train) + losses = [] + mae_errors = [] + sampler = UPP( + data, + sample_size=self.sample_size, + repeats=iterations, + random_state=None if train else 0 # different samples during train, same samples during validation + ) + pbar = tqdm(sampler.samples_parameters(), total=sampler.total()) + for it, index in enumerate(pbar): + sample_data = data.sampling_from_index(index) + sample_posteriors = posteriors[index] + quant_estims = self._get_aggregative_estims(sample_posteriors) + ptrue = torch.as_tensor([sample_data.prevalence()], dtype=torch.float, device=self.device) + if train: + self.optim.zero_grad() + phat = self.quanet.forward(sample_data.instances, sample_posteriors, quant_estims) + loss = mse_loss(phat, ptrue) + mae = mae_loss(phat, ptrue) + loss.backward() + self.optim.step() + else: + with torch.no_grad(): + phat = self.quanet.forward(sample_data.instances, sample_posteriors, quant_estims) + loss = mse_loss(phat, ptrue) + mae = mae_loss(phat, ptrue) + + losses.append(loss.item()) + mae_errors.append(mae.item()) + + mse = np.mean(losses) + mae = np.mean(mae_errors) + if train: + self.status['tr-loss'] = mse + self.status['tr-mae'] = mae + else: + self.status['va-loss'] = mse + self.status['va-mae'] = mae + + if train: + pbar.set_description(f'[QuaNet] ' + f'epoch={epoch} [it={it}/{iterations}]\t' + f'tr-mseloss={self.status["tr-loss"]:.5f} tr-maeloss={self.status["tr-mae"]:.5f}\t' + f'val-mseloss={self.status["va-loss"]:.5f} val-maeloss={self.status["va-mae"]:.5f} ' + f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}') + +
[docs] def get_params(self, deep=True): + classifier_params = self.classifier.get_params() + classifier_params = {'classifier__'+k:v for k,v in classifier_params.items()} + return {**classifier_params, **self.quanet_params}
+ +
[docs] def set_params(self, **parameters): + learner_params = {} + for key, val in parameters.items(): + if key in self.quanet_params: + self.quanet_params[key] = val + elif key.startswith('classifier__'): + learner_params[key.replace('classifier__', '')] = val + else: + raise ValueError('unknown parameter ', key) + self.classifier.set_params(**learner_params)
+ + def __check_params_colision(self, quanet_params, learner_params): + quanet_keys = set(quanet_params.keys()) + learner_keys = set(learner_params.keys()) + intersection = quanet_keys.intersection(learner_keys) + if len(intersection) > 0: + raise ValueError(f'the use of parameters {intersection} is ambiguous since those can refer to ' + f'the parameters of QuaNet or the learner {self.classifier.__class__.__name__}') + +
[docs] def clean_checkpoint(self): + """ + Removes the checkpoint + """ + os.remove(self.checkpoint)
+ +
[docs] def clean_checkpoint_dir(self): + """ + Removes anything contained in the checkpoint directory + """ + import shutil + shutil.rmtree(self.checkpointdir, ignore_errors=True)
+ + @property + def classes_(self): + return self._classes_
+ + +
[docs]def mae_loss(output, target): + """ + Torch-like wrapper for the Mean Absolute Error + + :param output: predictions + :param target: ground truth values + :return: mean absolute error loss + """ + return torch.mean(torch.abs(output - target))
+ + +
[docs]class QuaNetModule(torch.nn.Module): + """ + Implements the `QuaNet <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_ forward pass. + See :class:`QuaNetTrainer` for training QuaNet. + + :param doc_embedding_size: integer, the dimensionality of the document embeddings + :param n_classes: integer, number of classes + :param stats_size: integer, number of statistics estimated by simple quantification methods + :param lstm_hidden_size: integer, hidden dimensionality of the LSTM cell + :param lstm_nlayers: integer, number of LSTM layers + :param ff_layers: list of integers, dimensions of the densely-connected FF layers on top of the + quantification embedding + :param bidirectional: boolean, whether or not to use bidirectional LSTM + :param qdrop_p: float, dropout probability + :param order_by: integer, class for which the document embeddings are to be sorted + """ + + def __init__(self, + doc_embedding_size, + n_classes, + stats_size, + lstm_hidden_size=64, + lstm_nlayers=1, + ff_layers=[1024, 512], + bidirectional=True, + qdrop_p=0.5, + order_by=0): + + super().__init__() + + self.n_classes = n_classes + self.order_by = order_by + self.hidden_size = lstm_hidden_size + self.nlayers = lstm_nlayers + self.bidirectional = bidirectional + self.ndirections = 2 if self.bidirectional else 1 + self.qdrop_p = qdrop_p + self.lstm = torch.nn.LSTM(doc_embedding_size + n_classes, # +n_classes stands for the posterior probs. (concatenated) + lstm_hidden_size, lstm_nlayers, bidirectional=bidirectional, + dropout=qdrop_p, batch_first=True) + self.dropout = torch.nn.Dropout(self.qdrop_p) + + lstm_output_size = self.hidden_size * self.ndirections + ff_input_size = lstm_output_size + stats_size + prev_size = ff_input_size + self.ff_layers = torch.nn.ModuleList() + for lin_size in ff_layers: + self.ff_layers.append(torch.nn.Linear(prev_size, lin_size)) + prev_size = lin_size + self.output = torch.nn.Linear(prev_size, n_classes) + + @property + def device(self): + return torch.device('cuda') if next(self.parameters()).is_cuda else torch.device('cpu') + + def _init_hidden(self): + directions = 2 if self.bidirectional else 1 + var_hidden = torch.zeros(self.nlayers * directions, 1, self.hidden_size) + var_cell = torch.zeros(self.nlayers * directions, 1, self.hidden_size) + if next(self.lstm.parameters()).is_cuda: + var_hidden, var_cell = var_hidden.cuda(), var_cell.cuda() + return var_hidden, var_cell + +
[docs] def forward(self, doc_embeddings, doc_posteriors, statistics): + device = self.device + doc_embeddings = torch.as_tensor(doc_embeddings, dtype=torch.float, device=device) + doc_posteriors = torch.as_tensor(doc_posteriors, dtype=torch.float, device=device) + statistics = torch.as_tensor(statistics, dtype=torch.float, device=device) + + if self.order_by is not None: + order = torch.argsort(doc_posteriors[:, self.order_by]) + doc_embeddings = doc_embeddings[order] + doc_posteriors = doc_posteriors[order] + + embeded_posteriors = torch.cat((doc_embeddings, doc_posteriors), dim=-1) + + # the entire set represents only one instance in quapy contexts, and so the batch_size=1 + # the shape should be (1, number-of-instances, embedding-size + n_classes) + embeded_posteriors = embeded_posteriors.unsqueeze(0) + + self.lstm.flatten_parameters() + _, (rnn_hidden,_) = self.lstm(embeded_posteriors, self._init_hidden()) + rnn_hidden = rnn_hidden.view(self.nlayers, self.ndirections, 1, self.hidden_size) + quant_embedding = rnn_hidden[0].view(-1) + quant_embedding = torch.cat((quant_embedding, statistics)) + + abstracted = quant_embedding.unsqueeze(0) + for linear in self.ff_layers: + abstracted = self.dropout(relu(linear(abstracted))) + + logits = self.output(abstracted).view(1, -1) + prevalence = torch.softmax(logits, -1) + + return prevalence
+ + + + + +
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/method/_threshold_optim.html b/docs/build/html/_modules/quapy/method/_threshold_optim.html new file mode 100644 index 0000000..486aa61 --- /dev/null +++ b/docs/build/html/_modules/quapy/method/_threshold_optim.html @@ -0,0 +1,364 @@ + + + + + + quapy.method._threshold_optim — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +

Source code for quapy.method._threshold_optim

+from abc import abstractmethod
+
+import numpy as np
+from sklearn.base import BaseEstimator
+import quapy as qp
+import quapy.functional as F
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import BinaryAggregativeQuantifier
+
+
+
[docs]class ThresholdOptimization(BinaryAggregativeQuantifier): + """ + Abstract class of Threshold Optimization variants for :class:`ACC` as proposed by + `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and + `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_. + The goal is to bring improved stability to the denominator of the adjustment. + The different variants are based on different heuristics for choosing a decision threshold + that would allow for more true positives and many more false positives, on the grounds this + would deliver larger denominators. + + :param classifier: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. + This parameter can be indicated as a real value (between 0 and 1), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ + + def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None): + self.classifier = classifier + self.val_split = val_split + self.n_jobs = qp._get_njobs(n_jobs) + +
[docs] @abstractmethod + def condition(self, tpr, fpr) -> float: + """ + Implements the criterion according to which the threshold should be selected. + This function should return the (float) score to be minimized. + + :param tpr: float, true positive rate + :param fpr: float, false positive rate + :return: float, a score for the given `tpr` and `fpr` + """ + ...
+ +
[docs] def discard(self, tpr, fpr) -> bool: + """ + Indicates whether a combination of tpr and fpr should be discarded + + :param tpr: float, true positive rate + :param fpr: float, false positive rate + :return: true if the combination is to be discarded, false otherwise + """ + return (tpr - fpr) == 0
+ + + def _eval_candidate_thresholds(self, decision_scores, y): + """ + Seeks for the best `tpr` and `fpr` according to the score obtained at different + decision thresholds. The scoring function is implemented in function `_condition`. + + :param decision_scores: array-like with the classification scores + :param y: predicted labels for the validation set (or for the training set via `k`-fold cross validation) + :return: best `tpr` and `fpr` and `threshold` according to `_condition` + """ + candidate_thresholds = np.unique(decision_scores) + + candidates = [] + scores = [] + for candidate_threshold in candidate_thresholds: + y_ = self.classes_[1 * (decision_scores >= candidate_threshold)] + TP, FP, FN, TN = self._compute_table(y, y_) + tpr = self._compute_tpr(TP, FN) + fpr = self._compute_fpr(FP, TN) + if not self.discard(tpr, fpr): + candidate_score = self.condition(tpr, fpr) + candidates.append([tpr, fpr, candidate_threshold]) + scores.append(candidate_score) + + if len(candidates) == 0: + # if no candidate gives rise to a valid combination of tpr and fpr, this method defaults to the standard + # classify & count; this is akin to assign tpr=1, fpr=0, threshold=0 + tpr, fpr, threshold = 1, 0, 0 + candidates.append([tpr, fpr, threshold]) + scores.append(0) + + candidates = np.asarray(candidates) + candidates = candidates[np.argsort(scores)] # sort candidates by candidate_score + + return candidates + +
[docs] def aggregate_with_threshold(self, classif_predictions, tprs, fprs, thresholds): + # This function performs the adjusted count for given tpr, fpr, and threshold. + # Note that, due to broadcasting, tprs, fprs, and thresholds could be arrays of length > 1 + prevs_estims = np.mean(classif_predictions[:, None] >= thresholds, axis=0) + prevs_estims = (prevs_estims - fprs) / (tprs - fprs) + prevs_estims = F.as_binary_prevalence(prevs_estims, clip_if_necessary=True) + return prevs_estims.squeeze()
+ + def _compute_table(self, y, y_): + TP = np.logical_and(y == y_, y == self.pos_label).sum() + FP = np.logical_and(y != y_, y == self.neg_label).sum() + FN = np.logical_and(y != y_, y == self.pos_label).sum() + TN = np.logical_and(y == y_, y == self.neg_label).sum() + return TP, FP, FN, TN + + def _compute_tpr(self, TP, FN): + # (parameter renamed from FP to FN for clarity; the caller passes (TP, FN), so behavior is unchanged) + if TP + FN == 0: + return 1 + return TP / (TP + FN) + + def _compute_fpr(self, FP, TN): + if FP + TN == 0: + return 0 + return FP / (FP + TN) + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + decision_scores, y = classif_predictions.Xy + # the standard behavior is to keep the best threshold only + self.tpr, self.fpr, self.threshold = self._eval_candidate_thresholds(decision_scores, y)[0] + return self
+ +
[docs] def aggregate(self, classif_predictions: np.ndarray): + # the standard behavior is to compute the adjusted count using the best threshold found + return self.aggregate_with_threshold(classif_predictions, self.tpr, self.fpr, self.threshold)
+ + +
[docs]class T50(ThresholdOptimization): + """ + Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and + `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks + for the threshold that makes `tpr` closest to 0.5. + The goal is to bring improved stability to the denominator of the adjustment. + + :param classifier: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. + This parameter can be indicated as a real value (between 0 and 1), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ + + def __init__(self, classifier: BaseEstimator, val_split=5): + super().__init__(classifier, val_split) + +
[docs] def condition(self, tpr, fpr) -> float: + return abs(tpr - 0.5)
+ + +
[docs]class MAX(ThresholdOptimization): + """ + Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and + `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks + for the threshold that maximizes `tpr-fpr`. + The goal is to bring improved stability to the denominator of the adjustment. + + :param classifier: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. + This parameter can be indicated as a real value (between 0 and 1), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ + + def __init__(self, classifier: BaseEstimator, val_split=5): + super().__init__(classifier, val_split) + +
[docs] def condition(self, tpr, fpr) -> float: + # MAX strives to maximize (tpr - fpr), which is equivalent to minimize (fpr - tpr) + return (fpr - tpr)
+ + +
[docs]class X(ThresholdOptimization): + """ + Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and + `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks + for the threshold that yields `tpr=1-fpr`. + The goal is to bring improved stability to the denominator of the adjustment. + + :param classifier: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. + This parameter can be indicated as a real value (between 0 and 1), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ + + def __init__(self, classifier: BaseEstimator, val_split=5): + super().__init__(classifier, val_split) + +
[docs] def condition(self, tpr, fpr) -> float: + return abs(1 - (tpr + fpr))
+ + +
[docs]class MS(ThresholdOptimization): + """ + Median Sweep. Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and + `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that generates + class prevalence estimates for all decision thresholds and returns the median of them all. + The goal is to bring improved stability to the denominator of the adjustment. + + :param classifier: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. + This parameter can be indicated as a real value (between 0 and 1), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ + def __init__(self, classifier: BaseEstimator, val_split=5): + super().__init__(classifier, val_split) + +
[docs] def condition(self, tpr, fpr) -> float: + return 1
+ +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + decision_scores, y = classif_predictions.Xy + # keeps all candidates + tprs_fprs_thresholds = self._eval_candidate_thresholds(decision_scores, y) + self.tprs = tprs_fprs_thresholds[:, 0] + self.fprs = tprs_fprs_thresholds[:, 1] + self.thresholds = tprs_fprs_thresholds[:, 2] + return self
+ +
[docs] def aggregate(self, classif_predictions: np.ndarray): + prevalences = self.aggregate_with_threshold(classif_predictions, self.tprs, self.fprs, self.thresholds) + if prevalences.ndim==2: + prevalences = np.median(prevalences, axis=0) + return prevalences
+ + +
[docs]class MS2(MS): + """ + Median Sweep 2. Threshold Optimization variant for :class:`ACC` as proposed by + `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and + `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that generates + class prevalence estimates for all decision thresholds and returns the median of those estimates for + which `tpr-fpr>0.25`. + The goal is to bring improved stability to the denominator of the adjustment. + + :param classifier: a sklearn's Estimator that generates a classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set in which the + misclassification rates are to be estimated. + This parameter can be indicated as a real value (between 0 and 1), representing a proportion of + validation data, or as an integer, indicating that the misclassification rates should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + """ + def __init__(self, classifier: BaseEstimator, val_split=5): + super().__init__(classifier, val_split) +
[docs] def discard(self, tpr, fpr) -> bool: + return (tpr-fpr) <= 0.25
+
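A hedged usage sketch, valid for any of the threshold-optimization variants above (`training` and `test` are placeholders for a binary LabelledCollection split, not defined here):
>>> from sklearn.linear_model import LogisticRegression
>>> from quapy.method._threshold_optim import MS2
>>> quantifier = MS2(LogisticRegression(), val_split=5)
>>> quantifier.fit(training)                           # training must be binary
>>> prevalences = quantifier.quantify(test.instances)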
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/method/aggregative.html b/docs/build/html/_modules/quapy/method/aggregative.html new file mode 100644 index 0000000..8311baa --- /dev/null +++ b/docs/build/html/_modules/quapy/method/aggregative.html @@ -0,0 +1,1440 @@ + + + + + + quapy.method.aggregative — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +

Source code for quapy.method.aggregative

+from abc import ABC, abstractmethod
+from copy import deepcopy
+from typing import Callable, Union
+import numpy as np
+from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
+from scipy import optimize
+from sklearn.base import BaseEstimator
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.metrics import confusion_matrix
+from sklearn.model_selection import cross_val_predict
+
+import quapy as qp
+import quapy.functional as F
+from quapy.functional import get_divergence
+from quapy.classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
+from quapy.classification.svmperf import SVMperf
+from quapy.data import LabelledCollection
+from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric
+
+
+# Abstract classes
+# ------------------------------------
+
+
[docs]class AggregativeQuantifier(BaseQuantifier, ABC): + """ + Abstract class for quantification methods that base their estimations on the aggregation of classification + results. Aggregative quantifiers implement a pipeline that consists of generating classification predictions + and aggregating them. For this reason, the training phase is implemented by :meth:`classifier_fit_predict` followed + by :meth:`aggregation_fit`, while the testing phase is implemented by :meth:`classify` followed by + :meth:`aggregate`. Subclasses of this abstract class must provide implementations for these methods. + Aggregative quantifiers also maintain a :attr:`classifier` attribute. + + The method :meth:`fit` comes with a default implementation based on :meth:`classifier_fit_predict` + and :meth:`aggregation_fit`. + + The method :meth:`quantify` comes with a default implementation based on :meth:`classify` + and :meth:`aggregate`. + """ + + val_split_ = None + + @property + def val_split(self): + return self.val_split_ + + @val_split.setter + def val_split(self, val_split): + if isinstance(val_split, LabelledCollection): + print('warning: setting val_split with a LabelledCollection will be inefficient in ' + 'model selection. Rather pass the LabelledCollection at fit time') + self.val_split_ = val_split + + def _check_init_parameters(self): + """ + Implements any check to be performed in the parameters of the init method before undertaking + the training of the quantifier. This is done so as to allow for a quick execution stop when the + parameters are not valid. + + :return: Nothing. May raise an exception. + """ + pass + + def _check_non_empty_classes(self, data: LabelledCollection): + """ + Asserts all classes have positive instances. + + :param data: LabelledCollection + :return: Nothing. May raise an exception. + """ + sample_prevs = data.prevalence() + empty_classes = np.argwhere(sample_prevs==0).flatten() + if len(empty_classes)>0: + empty_class_names = data.classes_[empty_classes] + raise ValueError(f'classes {empty_class_names} have no training examples') + +
[docs] def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None): + """ + Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function. + + :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :return: self + """ + self._check_init_parameters() + classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on=val_split) + self.aggregation_fit(classif_predictions, data) + return self
+ +
[docs] def classifier_fit_predict(self, data: LabelledCollection, fit_classifier=True, predict_on=None): + """ + Trains the classifier if requested (`fit_classifier=True`) and generate the necessary predictions to + train the aggregation function. + + :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param fit_classifier: whether to train the learner (default is True). Set to False if the + learner has been trained outside the quantifier. + :param predict_on: specifies the set on which predictions need to be issued. This parameter can + be specified as None (default) to indicate no prediction is needed; a float in (0, 1) to + indicate the proportion of instances to be used for predictions (the remainder is used for + training); an integer >1 to indicate that the predictions must be generated via k-fold + cross-validation, using this integer as k; or the data sample itself on which to generate + the predictions. + """ + assert isinstance(fit_classifier, bool), 'unexpected type for "fit_classifier", must be boolean' + + self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba')) + + if fit_classifier: + self._check_non_empty_classes(data) + + if predict_on is None: + predict_on = self.val_split + + if predict_on is None: + if fit_classifier: + self.classifier.fit(*data.Xy) + predictions = None + elif isinstance(predict_on, float): + if fit_classifier: + if not (0. < predict_on < 1.): + raise ValueError(f'proportion {predict_on=} out of range, must be in (0,1)') + train, val = data.split_stratified(train_prop=(1 - predict_on)) + self.classifier.fit(*train.Xy) + predictions = LabelledCollection(self.classify(val.X), val.y, classes=data.classes_) + else: + raise ValueError(f'wrong type for predict_on: since fit_classifier=False, ' + f'the set on which predictions have to be issued must be ' + f'explicitly indicated') + + elif isinstance(predict_on, LabelledCollection): + if fit_classifier: + self.classifier.fit(*data.Xy) + predictions = LabelledCollection(self.classify(predict_on.X), predict_on.y, classes=predict_on.classes_) + + elif isinstance(predict_on, int): + if fit_classifier: + if predict_on <= 1: + raise ValueError(f'invalid value {predict_on} in fit. ' + f'Specify a integer >1 for kFCV estimation.') + else: + n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else qp._get_njobs(None) + predictions = cross_val_predict( + self.classifier, *data.Xy, cv=predict_on, n_jobs=n_jobs, method=self._classifier_method()) + predictions = LabelledCollection(predictions, data.y, classes=data.classes_) + self.classifier.fit(*data.Xy) + else: + raise ValueError(f'wrong type for predict_on: since fit_classifier=False, ' + f'the set on which predictions have to be issued must be ' + f'explicitly indicated') + + else: + raise ValueError( + f'error: param "predict_on" ({type(predict_on)}) not understood; ' + f'use either a float indicating the split proportion, or a ' + f'tuple (X,y) indicating the validation partition') + + return predictions
+ +
[docs] @abstractmethod + def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Trains the aggregation function. + + :param classif_predictions: a LabelledCollection containing the label predictions issued + by the classifier + :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + """ + ...
+ + @property + def classifier(self): + """ + Gives access to the classifier + + :return: the classifier (typically an sklearn's Estimator) + """ + return self.classifier_ + + @classifier.setter + def classifier(self, classifier): + """ + Setter for the classifier + + :param classifier: the classifier + """ + self.classifier_ = classifier + +
[docs] def classify(self, instances): + """ + Provides the label predictions for the given instances. The predictions should respect the format expected by + :meth:`aggregate`, e.g., posterior probabilities for probabilistic quantifiers, or crisp predictions for + non-probabilistic quantifiers. The default one is "decision_function". + + :param instances: array-like of shape `(n_instances, n_features,)` + :return: np.ndarray of shape `(n_instances,)` with label predictions + """ + return getattr(self.classifier, self._classifier_method())(instances)
+ + def _classifier_method(self): + """ + Name of the method that must be used for issuing label predictions. The default one is "decision_function". + + :return: string + """ + return 'decision_function' + + def _check_classifier(self, adapt_if_necessary=False): + """ + Guarantees that the underlying classifier implements the method required for issuing predictions, i.e., + the method indicated by :meth:`_classifier_method` + + :param adapt_if_necessary: if True, the method will try to comply with the required specifications + """ + assert hasattr(self.classifier, self._classifier_method()), \ + f"the classifier does not implement the required {self._classifier_method()} method" + +
[docs] def quantify(self, instances): + """ + Generate class prevalence estimates for the sample's instances by aggregating the label predictions generated + by the classifier. + + :param instances: array-like + :return: `np.ndarray` of shape `(n_classes)` with class prevalence estimates. + """ + classif_predictions = self.classify(instances) + return self.aggregate(classif_predictions)
+ +
[docs] @abstractmethod + def aggregate(self, classif_predictions: np.ndarray): + """ + Implements the aggregation of label predictions. + + :param classif_predictions: `np.ndarray` of label predictions + :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates. + """ + ...
+ + @property + def classes_(self): + """ + Class labels, in the same order in which class prevalence values are to be computed. + This default implementation actually returns the class labels of the learner. + + :return: array-like + """ + return self.classifier.classes_
+ + +
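As a rough sketch of how these pieces fit together, `quantify` is simply `aggregate` applied to the output of `classify`; the example below uses `CC` (defined further down) on synthetic data and assumes the LabelledCollection-based `fit` interface of this version:

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.aggregative import CC

X = np.random.rand(300, 4)
y = (X[:, 0] > 0.5).astype(int)
train = LabelledCollection(X, y)

model = CC(LogisticRegression()).fit(train)
label_predictions = model.classify(X)            # predictions in the format expected by aggregate()
prevalence = model.aggregate(label_predictions)  # class prevalence estimates
assert np.allclose(prevalence, model.quantify(X))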
[docs]class AggregativeCrispQuantifier(AggregativeQuantifier, ABC): + """ + Abstract class for quantification methods that base their estimations on the aggregation of crisp decisions + as returned by a hard classifier. Aggregative crisp quantifiers thus extend Aggregative + Quantifiers by implementing specifications about crisp predictions. + """ + + def _classifier_method(self): + """ + Name of the method that must be used for issuing label predictions. For crisp quantifiers, the method + is 'predict', which returns an array of shape `(n_instances,)` of label predictions. + + :return: the string "predict", i.e., the standard method name for scikit-learn hard predictions + """ + return 'predict'
+ + +
[docs]class AggregativeSoftQuantifier(AggregativeQuantifier, ABC): + """ + Abstract class for quantification methods that base their estimations on the aggregation of posterior + probabilities as returned by a probabilistic classifier. + Aggregative soft quantifiers thus extend Aggregative Quantifiers by implementing specifications + about soft predictions. + """ + + def _classifier_method(self): + """ + Name of the method that must be used for issuing label predictions. For probabilistic quantifiers, the method + is 'predict_proba', which returns an array of shape `(n_instances, n_dimensions,)` with posterior + probabilities. + + :return: the string "predict_proba", i.e., the standard method name for scikit-learn soft predictions + """ + return 'predict_proba' + + def _check_classifier(self, adapt_if_necessary=False): + """ + Guarantees that the underlying classifier implements the method indicated by :meth:`_classifier_method`. + In case it does not, the classifier is calibrated (by means of Platt's calibration method as implemented by + scikit-learn in CalibratedClassifierCV, with cv=5). This calibration is only allowed if `adapt_if_necessary` + is set to True. Otherwise (i.e., if the classifier is not probabilistic and `adapt_if_necessary` is set + to False), an exception is raised. + + :param adapt_if_necessary: a hard classifier is turned into a soft classifier if `adapt_if_necessary==True` + """ + if not hasattr(self.classifier, self._classifier_method()): + if adapt_if_necessary: + print(f'warning: The learner {self.classifier.__class__.__name__} does not seem to be ' + f'probabilistic. The learner will be calibrated (using CalibratedClassifierCV).') + self.classifier = CalibratedClassifierCV(self.classifier, cv=5) + else: + raise AssertionError(f'error: The learner {self.classifier.__class__.__name__} does not ' + f'seem to be probabilistic. The learner cannot be calibrated since ' + f'fit_classifier is set to False')
+ + +
[docs]class BinaryAggregativeQuantifier(AggregativeQuantifier, BinaryQuantifier): + + @property + def pos_label(self): + return self.classifier.classes_[1] + + @property + def neg_label(self): + return self.classifier.classes_[0] + +
[docs] def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None): + self._check_binary(data, self.__class__.__name__) + return super().fit(data, fit_classifier, val_split)
+ + +# Methods +# ------------------------------------ +
[docs]class CC(AggregativeCrispQuantifier): + """ + The most basic Quantification method. One that simply classifies all instances and counts how many have been + attributed to each of the classes in order to compute class prevalence estimates. + + :param classifier: a sklearn's Estimator that generates a classifier + """ + + def __init__(self, classifier: BaseEstimator): + self.classifier = classifier + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Nothing to do here! + + :param classif_predictions: this is actually None + """ + pass
+ +
[docs] def aggregate(self, classif_predictions: np.ndarray): + """ + Computes class prevalence estimates by counting the prevalence of each of the predicted labels. + + :param classif_predictions: array-like with label predictions + :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates. + """ + return F.prevalence_from_labels(classif_predictions, self.classes_)
+ + +
[docs]class ACC(AggregativeCrispQuantifier): + """ + `Adjusted Classify & Count <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_, + the "adjusted" variant of :class:`CC`, that corrects the predictions of CC + according to the `misclassification rates`. + + :param classifier: a sklearn's Estimator that generates a classifier + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`); or as a collection defining the specific set of data to use for validation. + Alternatively, this set can be specified at fit time by indicating the exact set of data + on which the predictions are to be generated. + :param n_jobs: number of parallel workers + :param solver: indicates the method to be used for obtaining the final estimates. The choice + 'exact' comes down to solving the system of linear equations :math:`Ax=B` where `A` is a + matrix containing the class-conditional probabilities of the predictions (e.g., the tpr and fpr in + binary) and `B` is the vector of prevalence values estimated via CC, as :math:`x=A^{-1}B`. This solution + might not exist for degenerate classifiers, in which case the method defaults to classify and count + (i.e., does not attempt any adjustment). + Another option is to search for the prevalence vector that minimizes the L2 norm of :math:`Ax-B`. The latter + is achieved by indicating solver='minimize'. This one generally works better, and is the default parameter. + More details about this can be consulted in `Bunse, M. "On Multi-Class Extensions of Adjusted Classify and + Count", in Proceedings of the 2nd International Workshop on Learning to Quantify: Methods and Applications + (LQ 2022), ECML/PKDD 2022, Grenoble (France) <https://lq-2022.github.io/proceedings/CompleteVolume.pdf>`_. + """ + + def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None, solver='minimize'): + self.classifier = classifier + self.val_split = val_split + self.n_jobs = qp._get_njobs(n_jobs) + self.solver = solver + + def _check_init_parameters(self): + assert self.solver in ['exact', 'minimize'], "unknown solver; valid ones are 'exact', 'minimize'" + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Estimates the misclassification rates. + + :param classif_predictions: classifier predictions with true labels + """ + pred_labels, true_labels = classif_predictions.Xy + self.cc = CC(self.classifier) + self.Pte_cond_estim_ = self.getPteCondEstim(self.classifier.classes_, true_labels, pred_labels)
+ +
[docs] @classmethod + def getPteCondEstim(cls, classes, y, y_): + # estimate the matrix with entry (i,j) being the estimate of P(hat_yi|yj), that is, the probability that a + # document that belongs to yj ends up being classified as belonging to yi + conf = confusion_matrix(y, y_, labels=classes).T + conf = conf.astype(float) + class_counts = conf.sum(axis=0) + for i, _ in enumerate(classes): + if class_counts[i] == 0: + conf[i, i] = 1 + else: + conf[:, i] /= class_counts[i] + return conf
+ +
[docs] def aggregate(self, classif_predictions): + prevs_estim = self.cc.aggregate(classif_predictions) + return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim, solver=self.solver)
+ +
[docs] @classmethod + def solve_adjustment(cls, PteCondEstim, prevs_estim, solver='exact'): + """ + Solves the linear system :math:`Ax = B` with :math:`A` = `PteCondEstim` and :math:`B` = `prevs_estim` + + :param PteCondEstim: a `np.ndarray` of shape `(n_classes,n_classes,)` with entry `(i,j)` being the estimate + of :math:`P(y_i|y_j)`, that is, the probability that an instance that belongs to :math:`y_j` ends up being + classified as belonging to :math:`y_i` + :param prevs_estim: a `np.ndarray` of shape `(n_classes,)` with the class prevalence estimates + :param solver: indicates the method to use for solving the system of linear equations. Valid options are + 'exact' (tries to solve the system exactly --may fail if the misclassification matrix has rank < n_classes) or + 'minimize' (minimizes a norm --a solution always exists). + :return: an adjusted `np.ndarray` of shape `(n_classes,)` with the corrected class prevalence estimates + """ + + A = PteCondEstim + B = prevs_estim + + if solver == 'exact': + # attempts an exact solution of the linear system (may fail) + + try: + adjusted_prevs = np.linalg.solve(A, B) + adjusted_prevs = np.clip(adjusted_prevs, 0, 1) + adjusted_prevs /= adjusted_prevs.sum() + except np.linalg.LinAlgError: + adjusted_prevs = prevs_estim # no way to adjust them! + + return adjusted_prevs + + elif solver == 'minimize': + # poses the problem as an optimization one, and tries to minimize the norm of the differences + + def loss(prev): + return np.linalg.norm(A @ prev - B) + + return F.optim_minimize(loss, n_classes=A.shape[0])
+ + +
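A small numeric illustration of the adjustment (all values are made up; in the binary case the columns of the matrix reduce to the fpr and tpr):

import numpy as np
from quapy.method.aggregative import ACC

# PteCondEstim[i, j] = P(predicted=i | true=j); here fpr=0.3 and tpr=0.8
PteCondEstim = np.array([[0.7, 0.2],
                         [0.3, 0.8]])
cc_estimate = np.array([0.45, 0.55])  # prevalence as estimated by plain CC

adjusted = ACC.solve_adjustment(PteCondEstim, cc_estimate, solver='exact')
print(adjusted)  # [0.5, 0.5]: the exact solution of Ax = B, clipped and re-normalized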
[docs]class PCC(AggregativeSoftQuantifier): + """ + `Probabilistic Classify & Count <https://ieeexplore.ieee.org/abstract/document/5694031>`_, + the probabilistic variant of CC that relies on the posterior probabilities returned by a probabilistic classifier. + + :param classifier: a sklearn's Estimator that generates a classifier + """ + + def __init__(self, classifier: BaseEstimator): + self.classifier = classifier + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Nothing to do here! + + :param classif_predictions: this is actually None + """ + pass
+ +
[docs] def aggregate(self, classif_posteriors): + return F.prevalence_from_probabilities(classif_posteriors, binarize=False)
+ + +
[docs]class PACC(AggregativeSoftQuantifier): + """ + `Probabilistic Adjusted Classify & Count <https://ieeexplore.ieee.org/abstract/document/5694031>`_, + the probabilistic variant of ACC that relies on the posterior probabilities returned by a probabilistic classifier. + + :param classifier: a sklearn's Estimator that generates a classifier + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer (default 5), indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`). Alternatively, this set can be specified at fit time by indicating the exact set of data + on which the predictions are to be generated. + :param n_jobs: number of parallel workers + :param solver: indicates the method to be used for obtaining the final estimates. The choice + 'exact' comes down to solving the system of linear equations :math:`Ax=B` where `A` is a + matrix containing the class-conditional probabilities of the predictions (e.g., the tpr and fpr in + binary) and `B` is the vector of prevalence values estimated via CC, as :math:`x=A^{-1}B`. This solution + might not exist for degenerate classifiers, in which case the method defaults to classify and count + (i.e., does not attempt any adjustment). + Another option is to search for the prevalence vector that minimizes the L2 norm of :math:`Ax-B`. The latter + is achieved by indicating solver='minimize'. This one generally works better, and is the default parameter. + More details about this can be consulted in `Bunse, M. "On Multi-Class Extensions of Adjusted Classify and + Count", in Proceedings of the 2nd International Workshop on Learning to Quantify: Methods and Applications + (LQ 2022), ECML/PKDD 2022, Grenoble (France) <https://lq-2022.github.io/proceedings/CompleteVolume.pdf>`_. + + """ + + def __init__(self, classifier: BaseEstimator, val_split=5, n_jobs=None, solver='minimize'): + self.classifier = classifier + self.val_split = val_split + self.n_jobs = qp._get_njobs(n_jobs) + self.solver = solver + + def _check_init_parameters(self): + assert self.solver in ['exact', 'minimize'], "unknown solver; valid ones are 'exact', 'minimize'" + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Estimates the misclassification rates + + :param classif_predictions: classifier soft predictions with true labels + """ + posteriors, true_labels = classif_predictions.Xy + self.pcc = PCC(self.classifier) + self.Pte_cond_estim_ = self.getPteCondEstim(self.classifier.classes_, true_labels, posteriors)
+ +
[docs] def aggregate(self, classif_posteriors): + prevs_estim = self.pcc.aggregate(classif_posteriors) + return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim, solver=self.solver)
+ +
[docs] @classmethod + def getPteCondEstim(cls, classes, y, y_): + # estimate the matrix with entry (i,j) being the estimate of P(hat_yi|yj), that is, the probability that a + # document that belongs to yj ends up being classified as belonging to yi + n_classes = len(classes) + confusion = np.eye(n_classes) + for i, class_ in enumerate(classes): + idx = y == class_ + if idx.any(): + confusion[i] = y_[idx].mean(axis=0) + + return confusion.T
+ + +
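A tiny numeric sketch of the soft misclassification matrix computed by `getPteCondEstim` (arbitrary posteriors, two classes):

import numpy as np
from quapy.method.aggregative import PACC

# posteriors of 4 validation instances (columns: P(y=0|x), P(y=1|x)) and their true labels
posteriors = np.array([[0.9, 0.1],
                       [0.7, 0.3],
                       [0.2, 0.8],
                       [0.4, 0.6]])
true_labels = np.array([0, 0, 1, 1])

# entry (i, j) estimates P(predicted=i | true=j) from the average posteriors per true class
Pte_cond = PACC.getPteCondEstim(classes=[0, 1], y=true_labels, y_=posteriors)
print(Pte_cond)  # [[0.8 0.3]
                 #  [0.2 0.7]]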
[docs]class EMQ(AggregativeSoftQuantifier): + """ + `Expectation Maximization for Quantification <https://ieeexplore.ieee.org/abstract/document/6789744>`_ (EMQ), + aka `Saerens-Latinne-Decaestecker` (SLD) algorithm. + EMQ consists of using the well-known `Expectation Maximization algorithm` to iteratively update the posterior + probabilities generated by a probabilistic classifier and the class prevalence estimates obtained via + maximum-likelihood estimation, in a mutually recursive way, until convergence. + + This implementation also gives access to the heuristics proposed in the `Alexandari et al. paper + <http://proceedings.mlr.press/v119/alexandari20a.html>`_. These heuristics consist of using, as the training + prevalence, an estimate of it obtained via k-fold cross validation (instead of the true training prevalence), + and of recalibrating the posterior probabilities of the classifier. + + :param classifier: a sklearn's Estimator that generates a classifier + :param val_split: specifies the data used for generating classifier predictions. This specification + can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to + be extracted from the training set; or as an integer, indicating that the predictions + are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value + for `k`, default 5); or as a collection defining the specific set of data to use for validation. + Alternatively, this set can be specified at fit time by indicating the exact set of data + on which the predictions are to be generated. This hyperparameter is only meant to be used when the + heuristics are to be applied, i.e., if a recalibration is required. The default value is None (meaning + the recalibration is not required). In case this hyperparameter is set to a value other than None, but + the recalibration is not required (recalib=None), a warning message will be raised. + :param exact_train_prev: set to True (default) for using the true training prevalence as the initial observation; + set to False for computing the training prevalence as an estimate of it, i.e., as the expected + value of the posterior probabilities of the training instances. + :param recalib: a string indicating the method of recalibration. + Available choices include "nbvs" (No-Bias Vector Scaling), "bcts" (Bias-Corrected Temperature Scaling), + "ts" (Temperature Scaling), and "vs" (Vector Scaling). The default is None (no recalibration). + :param n_jobs: number of parallel workers. Only used for recalibrating the classifier if `val_split` is set to + an integer `k` --the number of folds. + """ + + MAX_ITER = 1000 + EPSILON = 1e-4 + + def __init__(self, classifier: BaseEstimator, val_split=None, exact_train_prev=True, recalib=None, n_jobs=None): + self.classifier = classifier + self.val_split = val_split + self.exact_train_prev = exact_train_prev + self.recalib = recalib + self.n_jobs = n_jobs + +
[docs] @classmethod + def EMQ_BCTS(cls, classifier: BaseEstimator, n_jobs=None): + """ + Constructs an instance of EMQ using the best configuration found in the `Alexandari et al. paper + <http://proceedings.mlr.press/v119/alexandari20a.html>`_, i.e., one that relies on Bias-Corrected Temperature + Scaling (BCTS) as a recalibration function, and that uses an estimate of the training prevalence instead of + the true training prevalence. + + :param classifier: a sklearn's Estimator that generates a classifier + :param n_jobs: number of parallel workers. + :return: An instance of EMQ with BCTS + """ + return EMQ(classifier, val_split=5, exact_train_prev=False, recalib='bcts', n_jobs=n_jobs)
+ + def _check_init_parameters(self): + if self.val_split is not None: + if self.exact_train_prev and self.recalib is None: + raise RuntimeWarning(f'The parameter {self.val_split=} was specified for EMQ, while the parameters ' + f'{self.exact_train_prev=} and {self.recalib=}. This has no effect and causes an unnecessary ' + f'overhead.') + else: + if self.recalib is not None: + print(f'[warning] The parameter {self.recalib=} requires val_split to be different from None. ' + f'This parameter will be set to 5. To avoid this warning, set this value to a float value ' + f'indicating the proportion of training data to be used as validation, or to an integer ' + f'indicating the number of folds for kFCV.') + self.val_split=5 + +
[docs] def classify(self, instances): + """ + Provides the posterior probabilities for the given instances. If the classifier was required + to be recalibrated, then these posteriors are recalibrated accordingly. + + :param instances: array-like of shape `(n_instances, n_dimensions,)` + :return: np.ndarray of shape `(n_instances, n_classes,)` with posterior probabilities + """ + posteriors = self.classifier.predict_proba(instances) + if hasattr(self, 'calibration_function') and self.calibration_function is not None: + posteriors = self.calibration_function(posteriors) + return posteriors
+ +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + if self.recalib is not None: + P, y = classif_predictions.Xy + if self.recalib == 'nbvs': + calibrator = NoBiasVectorScaling() + elif self.recalib == 'bcts': + calibrator = TempScaling(bias_positions='all') + elif self.recalib == 'ts': + calibrator = TempScaling() + elif self.recalib == 'vs': + calibrator = VectorScaling() + else: + raise ValueError('invalid param argument for recalibration method; available ones are ' + '"nbvs", "bcts", "ts", and "vs".') + + self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True) + + if self.exact_train_prev: + self.train_prevalence = data.prevalence() + else: + train_posteriors = classif_predictions.X + if self.recalib is not None: + train_posteriors = self.calibration_function(train_posteriors) + self.train_prevalence = F.prevalence_from_probabilities(train_posteriors)
+ +
[docs] def aggregate(self, classif_posteriors, epsilon=EPSILON): + priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon) + return priors
+ +
[docs] def predict_proba(self, instances, epsilon=EPSILON): + """ + Returns the posterior probabilities updated by the EM algorithm. + + :param instances: np.ndarray of shape `(n_instances, n_dimensions)` + :param epsilon: error tolerance + :return: np.ndarray of shape `(n_instances, n_classes)` + """ + classif_posteriors = self.classify(instances) + priors, posteriors = self.EM(self.train_prevalence, classif_posteriors, epsilon) + return posteriors
+ +
[docs] @classmethod + def EM(cls, tr_prev, posterior_probabilities, epsilon=EPSILON): + """ + Computes the `Expectation Maximization` routine. + + :param tr_prev: array-like, the training prevalence + :param posterior_probabilities: `np.ndarray` of shape `(n_instances, n_classes,)` with the + posterior probabilities + :param epsilon: float, the threshold difference between two consecutive iterations + below which convergence is declared and the loop stops + :return: a tuple with the estimated prevalence values (shape `(n_classes,)`) and + the corrected posterior probabilities (shape `(n_instances, n_classes,)`) + """ + Px = posterior_probabilities + Ptr = np.copy(tr_prev) + qs = np.copy(Ptr) # qs (the running estimate) is initialized as the training prevalence + + s, converged = 0, False + qs_prev_ = None + while not converged and s < EMQ.MAX_ITER: + # E-step: ps is Ps(y|xi) + ps_unnormalized = (qs / Ptr) * Px + ps = ps_unnormalized / ps_unnormalized.sum(axis=1, keepdims=True) + + # M-step: + qs = ps.mean(axis=0) + + if qs_prev_ is not None and qp.error.mae(qs, qs_prev_) < epsilon and s > 10: + converged = True + + qs_prev_ = qs + s += 1 + + if not converged: + print('[warning] the method has reached the maximum number of iterations; it might have not converged') + + return qs, ps
+ + +
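A toy run of the EM routine in isolation (synthetic posteriors; the classifier is deliberately left out of the picture):

import numpy as np
from quapy.method.aggregative import EMQ

rng = np.random.default_rng(0)
# synthetic binary posteriors: 700 instances with high positive posteriors, 300 with low ones
pos = np.clip(rng.normal(0.8, 0.1, size=700), 0.01, 0.99)
neg = np.clip(rng.normal(0.3, 0.1, size=300), 0.01, 0.99)
p_positive = np.concatenate([pos, neg])
posteriors = np.column_stack([1 - p_positive, p_positive])  # rows sum to 1

train_prev = np.array([0.5, 0.5])  # prevalence observed during training
prev, corrected_posteriors = EMQ.EM(train_prev, posteriors)
print(prev)  # EM-corrected prevalence estimate, shape (2,)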
[docs]class HDy(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): + """ + `Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy). + HDy is a probabilistic method for training binary quantifiers that models quantification as the problem of + minimizing the divergence (in terms of the Hellinger Distance) between two distributions of posterior + probabilities returned by the classifier. One of the distributions is generated from the unlabelled examples and + the other is generated from a validation set. This latter distribution is defined as a mixture of the + class-conditional distributions of the posterior probabilities returned for the positive and negative validation + examples, respectively. The parameters of the mixture thus represent the estimates of the class prevalence values. + + :param classifier: a sklearn's Estimator that generates a binary classifier + :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out + validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5). + """ + + def __init__(self, classifier: BaseEstimator, val_split=5): + self.classifier = classifier + self.val_split = val_split + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Trains the aggregation function of HDy. + + :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, as instances, + the posterior probabilities issued by the classifier and, as labels, the true labels + :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :return: self + """ + P, y = classif_predictions.Xy + Px = P[:, self.pos_label] # takes only the P(y=+1|x) + self.Pxy1 = Px[y == self.pos_label] + self.Pxy0 = Px[y == self.neg_label] + + # pre-compute the histogram for positive and negative examples + self.bins = np.linspace(10, 110, 11, dtype=int) # [10, 20, 30, ..., 100, 110] + + def hist(P, bins): + h = np.histogram(P, bins=bins, range=(0, 1), density=True)[0] + return h / h.sum() + + self.Pxy1_density = {bins: hist(self.Pxy1, bins) for bins in self.bins} + self.Pxy0_density = {bins: hist(self.Pxy0, bins) for bins in self.bins} + + return self
+ +
[docs] def aggregate(self, classif_posteriors): + # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10, + # and the final estimated a priori probability was taken as the median of these 11 estimates." + # (González-Castro, et al., 2013). + + Px = classif_posteriors[:, self.pos_label] # takes only the P(y=+1|x) + + prev_estimations = [] + # for bins in np.linspace(10, 110, 11, dtype=int): #[10, 20, 30, ..., 100, 110] + # Pxy0_density, _ = np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True) + # Pxy1_density, _ = np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True) + for bins in self.bins: + Pxy0_density = self.Pxy0_density[bins] + Pxy1_density = self.Pxy1_density[bins] + + Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True) + + # the authors proposed to search for the prevalence yielding the best matching as a linear search + # at small steps (modern implementations resort to an optimization procedure, + # see class DistributionMatching) + prev_selected, min_dist = None, None + for prev in F.prevalence_linspace(n_prevalences=101, repeats=1, smooth_limits_epsilon=0.0): + Px_train = prev * Pxy1_density + (1 - prev) * Pxy0_density + hdy = F.HellingerDistance(Px_train, Px_test) + if prev_selected is None or hdy < min_dist: + prev_selected, min_dist = prev, hdy + prev_estimations.append(prev_selected) + + class1_prev = np.median(prev_estimations) + return F.as_binary_prevalence(class1_prev)
+ + +
[docs]class DyS(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): + """ + `DyS framework <https://ojs.aaai.org/index.php/AAAI/article/view/4376>`_ (DyS). + DyS is a generalization of HDy method, using a Ternary Search in order to find the prevalence that + minimizes the distance between distributions. + Details of the ternary search are taken from <https://dl.acm.org/doi/pdf/10.1145/3219819.3220059> + + :param classifier: a sklearn's Estimator that generates a binary classifier + :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out + validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5). + :param n_bins: an int with the number of bins to use to compute the histograms. + :param divergence: a str indicating the name of divergence (currently supported ones are "HD" or "topsoe"), or a + callable function that computes the divergence between two distributions (two equally sized arrays). + :param tol: a float with the tolerance for the ternary search algorithm. + :param n_jobs: number of parallel workers. + """ + + def __init__(self, classifier: BaseEstimator, val_split=5, n_bins=8, divergence: Union[str, Callable]= 'HD', tol=1e-05, n_jobs=None): + self.classifier = classifier + self.val_split = val_split + self.tol = tol + self.divergence = divergence + self.n_bins = n_bins + self.n_jobs = n_jobs + + def _ternary_search(self, f, left, right, tol): + """ + Finds the minimum of the unimodal function f() within [left, right] + """ + while abs(right - left) >= tol: + left_third = left + (right - left) / 3 + right_third = right - (right - left) / 3 + + if f(left_third) > f(right_third): + left = left_third + else: + right = right_third + + # Left and right are the current bounds; the minimum is between them + return (left + right) / 2 + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + Px, y = classif_predictions.Xy + Px = Px[:, self.pos_label] # takes only the P(y=+1|x) + self.Pxy1 = Px[y == self.pos_label] + self.Pxy0 = Px[y == self.neg_label] + self.Pxy1_density = np.histogram(self.Pxy1, bins=self.n_bins, range=(0, 1), density=True)[0] + self.Pxy0_density = np.histogram(self.Pxy0, bins=self.n_bins, range=(0, 1), density=True)[0] + return self
+ +
[docs] def aggregate(self, classif_posteriors): + Px = classif_posteriors[:, self.pos_label] # takes only the P(y=+1|x) + + Px_test = np.histogram(Px, bins=self.n_bins, range=(0, 1), density=True)[0] + divergence = get_divergence(self.divergence) + + def distribution_distance(prev): + Px_train = prev * self.Pxy1_density + (1 - prev) * self.Pxy0_density + return divergence(Px_train, Px_test) + + class1_prev = self._ternary_search(f=distribution_distance, left=0, right=1, tol=self.tol) + return F.as_binary_prevalence(class1_prev)
+ + +
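A self-contained sketch of the ternary-search principle used by `_ternary_search` above (the quadratic below is an arbitrary stand-in for DyS's distribution-distance loss):

def ternary_search_min(f, left=0.0, right=1.0, tol=1e-5):
    # shrink the interval by discarding the third whose endpoint yields the larger loss
    while abs(right - left) >= tol:
        left_third = left + (right - left) / 3
        right_third = right - (right - left) / 3
        if f(left_third) > f(right_third):
            left = left_third
        else:
            right = right_third
    return (left + right) / 2

prev = ternary_search_min(lambda p: (p - 0.3) ** 2)  # unimodal toy loss with minimum at 0.3
print(round(prev, 3))  # ~0.3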
[docs]class SMM(AggregativeSoftQuantifier, BinaryAggregativeQuantifier): + """ + `SMM method <https://ieeexplore.ieee.org/document/9260028>`_ (SMM). + SMM is a simplification of distribution matching methods, in which the representation of the examples + is created using the mean instead of a histogram (conceptually equivalent to PACC). + + :param classifier: a sklearn's Estimator that generates a binary classifier. + :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out + validation distribution, or a :class:`quapy.data.base.LabelledCollection` (the split itself), or an integer indicating the number of folds (default 5). + """ + + def __init__(self, classifier: BaseEstimator, val_split=5): + self.classifier = classifier + self.val_split = val_split + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + Px, y = classif_predictions.Xy + Px = Px[:, self.pos_label] # takes only the P(y=+1|x) + self.Pxy1 = Px[y == self.pos_label] + self.Pxy0 = Px[y == self.neg_label] + self.Pxy1_mean = np.mean(self.Pxy1) # equiv. TPR + self.Pxy0_mean = np.mean(self.Pxy0) # equiv. FPR + return self
+ +
[docs] def aggregate(self, classif_posteriors): + Px = classif_posteriors[:, self.pos_label] # takes only the P(y=+1|x) + Px_mean = np.mean(Px) + + class1_prev = (Px_mean - self.Pxy0_mean)/(self.Pxy1_mean - self.Pxy0_mean) + return F.as_binary_prevalence(class1_prev, clip_if_necessary=True)
+ + +
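The aggregation above admits a closed form; a toy numeric check with made-up values:

tpr = 0.85        # mean positive posterior on positive validation examples (Pxy1_mean)
fpr = 0.20        # mean positive posterior on negative validation examples (Pxy0_mean)
test_mean = 0.46  # mean positive posterior on the test sample

positive_prevalence = (test_mean - fpr) / (tpr - fpr)
print(positive_prevalence)  # 0.4; F.as_binary_prevalence turns this into the vector (0.6, 0.4)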
[docs]class DMy(AggregativeSoftQuantifier): + """ + Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of posterior + probabilities. This implementation takes the number of bins, the divergence, and the possibility to work on CDF + as hyperparameters. + + :param classifier: a `sklearn`'s Estimator that generates a probabilistic classifier + :param val_split: indicates the proportion of data to be used as a stratified held-out validation set to model the + validation distribution. + This parameter can be indicated as a real value (between 0 and 1), representing a proportion of + validation data, or as an integer, indicating that the validation distribution should be estimated via + `k`-fold cross validation (this integer stands for the number of folds `k`, default 5), or as a + :class:`quapy.data.base.LabelledCollection` (the split itself). + :param nbins: number of bins used to discretize the distributions (default 8) + :param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented) + or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger + Distance) + :param cdf: whether to use CDF instead of PDF (default False) + :param search: the optimization routine used to minimize the divergence (default 'optim_minimize') + :param n_jobs: number of parallel workers (default None) + """ + + def __init__(self, classifier, val_split=5, nbins=8, divergence: Union[str, Callable]='HD', + cdf=False, search='optim_minimize', n_jobs=None): + self.classifier = classifier + self.val_split = val_split + self.nbins = nbins + self.divergence = divergence + self.cdf = cdf + self.search = search + self.n_jobs = n_jobs + + # @classmethod + # def HDy(cls, classifier, val_split=5, n_jobs=None): + # from quapy.method.meta import MedianEstimator + # + # hdy = DMy(classifier=classifier, val_split=val_split, search='linear_search', divergence='HD') + # hdy = AggregativeMedianEstimator(hdy, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=n_jobs) + # return hdy + + def _get_distributions(self, posteriors): + histograms = [] + post_dims = posteriors.shape[1] + if post_dims == 2: + # in binary quantification we can use only one class, since the other one is its complement + post_dims = 1 + for dim in range(post_dims): + hist = np.histogram(posteriors[:, dim], bins=self.nbins, range=(0, 1))[0] + histograms.append(hist) + + counts = np.vstack(histograms) + distributions = counts/counts.sum(axis=1)[:,np.newaxis] + if self.cdf: + distributions = np.cumsum(distributions, axis=1) + return distributions + +
[docs] def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + """ + Generates the validation distributions out of the training data. + The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of + channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; then `di=V[i]` + are the distributions obtained from training data labelled with class `i`; while `dij = di[j]` is the discrete + distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]` + is the fraction of instances with a value in the `k`-th bin. + + :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, as instances, + the posterior probabilities issued by the classifier and, as labels, the true labels + :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + """ + posteriors, true_labels = classif_predictions.Xy + n_classes = len(self.classifier.classes_) + + self.validation_distribution = qp.util.parallel( + func=self._get_distributions, + args=[posteriors[true_labels==cat] for cat in range(n_classes)], + n_jobs=self.n_jobs, + backend='threading' + )
+ +
[docs] def aggregate(self, posteriors: np.ndarray): + """ + Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution + (the mixture) that best matches the test distribution, in terms of the divergence measure of choice. + In the multiclass case, with `n` the number of classes, the test and mixture distributions contain + `n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed + independently. The matching is computed as an average of the divergence across all channels. + + :param posteriors: posterior probabilities of the instances in the sample + :return: a vector of class prevalence estimates + """ + test_distribution = self._get_distributions(posteriors) + divergence = get_divergence(self.divergence) + n_classes, n_channels, nbins = self.validation_distribution.shape + def loss(prev): + prev = np.expand_dims(prev, axis=0) + mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_channels, -1) + divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)] + return np.mean(divs) + + return F.argmin_prevalence(loss, n_classes, method=self.search)
+ + + +
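A usage sketch of DMy on synthetic multiclass data (hyperparameter values are arbitrary, and the LabelledCollection-based `fit` interface of this version is assumed):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.aggregative import DMy

X = np.random.rand(600, 5)
y = np.digitize(X[:, 0], bins=[0.33, 0.66])  # 3 classes
train = LabelledCollection(X, y)

dm = DMy(LogisticRegression(), val_split=5, nbins=8, divergence='HD', cdf=False)
dm.fit(train)
print(dm.quantify(X))  # a vector of 3 prevalence estimates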
[docs]def newELM(svmperf_base=None, loss='01', C=1): + """ + Explicit Loss Minimization (ELM) quantifiers. + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_). + This function is equivalent to: + + >>> CC(SVMperf(svmperf_base, loss, C)) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default) + this path will be obtained from qp.environ['SVMPERF_HOME'] + :param loss: the loss to optimize (see :attr:`quapy.classification.svmperf.SVMperf.valid_losses`) + :param C: trade-off between training error and margin (default 1) + :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the + underlying classifier + """ + if svmperf_base is None: + svmperf_base = qp.environ['SVMPERF_HOME'] + assert svmperf_base is not None, \ + 'param svmperf_base was not specified, and the variable SVMPERF_HOME has not been set in the environment' + return CC(SVMperf(svmperf_base, loss=loss, C=C))
+ + +
[docs]def newSVMQ(svmperf_base=None, C=1): + """ + SVM(Q) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the `Q` loss combining a + classification-oriented loss and a quantification-oriented loss, as proposed by + `Barranquero et al. 2015 <https://www.sciencedirect.com/science/article/pii/S003132031400291X>`_. + Equivalent to: + + >>> CC(SVMperf(svmperf_base, loss='q', C=C)) + + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_). + This function is a wrapper around CC(SVMperf(svmperf_base, loss, C)) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default) + this path will be obtained from qp.environ['SVMPERF_HOME'] + :param C: trade-off between training error and margin (default 1) + :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the + underlying classifier + """ + return newELM(svmperf_base, loss='q', C=C)
+ +def newSVMKLD(svmperf_base=None, C=1): + """ + SVM(KLD) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Kullback-Leibler Divergence + as proposed by `Esuli et al. 2015 <https://dl.acm.org/doi/abs/10.1145/2700406>`_. + Equivalent to: + + >>> CC(SVMperf(svmperf_base, loss='kld', C=C)) + + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_). + This function is a wrapper around CC(SVMperf(svmperf_base, loss, C)) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default) + this path will be obtained from qp.environ['SVMPERF_HOME'] + :param C: trade-off between training error and margin (default 1) + :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the + underlying classifier + """ + return newELM(svmperf_base, loss='kld', C=C) + + +
[docs]def newSVMNKLD(svmperf_base=None, C=1): + """ + SVM(NKLD) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Kullback-Leibler Divergence + normalized via the logistic function, as proposed by + `Esuli et al. 2015 <https://dl.acm.org/doi/abs/10.1145/2700406>`_. + Equivalent to: + + >>> CC(SVMperf(svmperf_base, loss='nkld', C=C)) + + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_). + This function is a wrapper around CC(SVMperf(svmperf_base, loss, C)) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default) + this path will be obtained from qp.environ['SVMPERF_HOME'] + :param C: trade-off between training error and margin (default 1) + :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the + underlying classifier + """ + return newELM(svmperf_base, loss='nkld', C=C)
+ +
[docs]def newSVMAE(svmperf_base=None, C=1): + """ + SVM(AE) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Absolute Error as first used by + `Moreo and Sebastiani, 2021 <https://arxiv.org/abs/2011.02552>`_. + Equivalent to: + + >>> CC(SVMperf(svmperf_base, loss='mae', C=C)) + + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_). + This function is a wrapper around CC(SVMperf(svmperf_base, loss, C)) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default) + this path will be obtained from qp.environ['SVMPERF_HOME'] + :param C: trade-off between training error and margin (default 1) + :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the + underlying classifier + """ + return newELM(svmperf_base, loss='mae', C=C)
+ +
[docs]def newSVMRAE(svmperf_base=None, C=1): + """ + SVM(RAE) is an Explicit Loss Minimization (ELM) quantifier set to optimize for the Relative Absolute Error as first + used by `Moreo and Sebastiani, 2021 <https://arxiv.org/abs/2011.02552>`_. + Equivalent to: + + >>> CC(SVMperf(svmperf_base, loss='mrae', C=C)) + + Quantifiers based on ELM represent a family of methods based on structured output learning; + these quantifiers rely on classifiers that have been optimized using a quantification-oriented loss + measure. This implementation relies on + `Joachims’ SVM perf <https://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html>`_ structured output + learning algorithm, which has to be installed and patched for the purpose (see this + `script <https://github.com/HLT-ISTI/QuaPy/blob/master/prepare_svmperf.sh>`_). + This function is a wrapper around CC(SVMperf(svmperf_base, loss, C)) + + :param svmperf_base: path to the folder containing the binary files of `SVM perf`; if set to None (default) + this path will be obtained from qp.environ['SVMPERF_HOME'] + :param C: trade-off between training error and margin (default 1) + :return: returns an instance of CC set to work with SVMperf (with loss and C set properly) as the + underlying classifier + """ + return newELM(svmperf_base, loss='mrae', C=C)
+ + +
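A construction sketch for the ELM wrappers; the SVMperf binaries must have been downloaded, patched, and compiled beforehand, and the path below is only a placeholder:

import quapy as qp
from quapy.method.aggregative import newSVMQ

# placeholder path: it must point to a local, patched build of SVMperf
qp.environ['SVMPERF_HOME'] = './svm_perf_quantification'

svmq = newSVMQ(C=1)  # equivalent to CC(SVMperf(svmperf_base, loss='q', C=1))
# svmq.fit(train); svmq.quantify(test_instances)  # used like any other CC-based quantifier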
[docs]class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier): + """ + Allows any binary quantifier to perform quantification on single-label datasets. + The method maintains one binary quantifier for each class, and then l1-normalizes the outputs so that the + class prevalences sum up to 1. + This variant was used, along with the :class:`EMQ` quantifier, in + `Gao and Sebastiani, 2016 <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_. + + :param binary_quantifier: a quantifier (binary) that will be employed to work on multiclass models in a + one-vs-all manner + :param n_jobs: number of parallel workers + :param parallel_backend: the parallel backend for joblib (default "multiprocessing"); this is helpful for some + quantifiers (e.g., ELM-based ones) that cannot be run with multiprocessing, since the temp dir they create + during fit is removed and no longer available at predict time. + """ + + def __init__(self, binary_quantifier, n_jobs=None, parallel_backend='multiprocessing'): + assert isinstance(binary_quantifier, BaseQuantifier), \ + f'{binary_quantifier} does not seem to be a Quantifier' + assert isinstance(binary_quantifier, AggregativeQuantifier), \ + f'{binary_quantifier} does not seem to be of type Aggregative' + self.binary_quantifier = binary_quantifier + self.n_jobs = qp._get_njobs(n_jobs) + self.parallel_backend = parallel_backend + +
[docs] def classify(self, instances): + """ + If the base quantifier is not probabilistic, returns a matrix of shape `(n,m,)` with `n` the number of + instances and `m` the number of classes. The entry `(i,j)` is a binary value indicating whether instance + `i` belongs to class `j`. The binary classifications are independent of each other, meaning that an instance + can end up being attributed to 0, 1, or more classes. + If the base quantifier is probabilistic, returns a matrix of shape `(n,m,2)` with `n` the number of instances + and `m` the number of classes. The entry `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the + posterior probability that instance `i` belongs (resp. does not belong) to class `j`. The posterior + probabilities are independent of each other, meaning that, in general, they do not sum up to one. + + :param instances: array-like + :return: `np.ndarray` + """ + + classif_predictions = self._parallel(self._delayed_binary_classification, instances) + if isinstance(self.binary_quantifier, AggregativeSoftQuantifier): + return np.swapaxes(classif_predictions, 0, 1) + else: + return classif_predictions.T
+ +
[docs] def aggregate(self, classif_predictions): + prevalences = self._parallel(self._delayed_binary_aggregate, classif_predictions) + return F.normalize_prevalence(prevalences)
+ + def _delayed_binary_classification(self, c, X): + return self.dict_binary_quantifiers[c].classify(X) + + def _delayed_binary_aggregate(self, c, classif_predictions): + # the estimation for the positive class prevalence + return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
+ + +
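A construction sketch that turns the binary HDy into a multiclass quantifier (fitting requires a multiclass LabelledCollection, indicated here only as a commented-out placeholder):

from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import HDy, OneVsAllAggregative

# HDy is binary-only; the one-vs-all wrapper maintains one HDy instance per class
ova_hdy = OneVsAllAggregative(HDy(LogisticRegression()), n_jobs=-1)
# ova_hdy.fit(multiclass_training_collection)
# ova_hdy.quantify(test_instances)  # l1-normalized prevalence estimates over all classes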
[docs]class AggregativeMedianEstimator(BinaryQuantifier): + """ + This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the + estimations returned by differently (hyper)parameterized base quantifiers. + The median of unit-vectors is only guaranteed to be a unit-vector for n=2 dimensions, + i.e., in cases of binary quantification. + + :param base_quantifier: the base, binary quantifier + :param random_state: a seed to be set before fitting any base quantifier (default None) + :param param_grid: the grid of parameters over which the median will be computed + :param n_jobs: number of parallel workers + """ + def __init__(self, base_quantifier: AggregativeQuantifier, param_grid: dict, random_state=None, n_jobs=None): + self.base_quantifier = base_quantifier + self.param_grid = param_grid + self.random_state = random_state + self.n_jobs = qp._get_njobs(n_jobs) + +
[docs] def get_params(self, deep=True): + return self.base_quantifier.get_params(deep)
+ +
[docs] def set_params(self, **params): + self.base_quantifier.set_params(**params)
+ + def _delayed_fit(self, args): + with qp.util.temp_seed(self.random_state): + params, training = args + model = deepcopy(self.base_quantifier) + model.set_params(**params) + model.fit(training) + return model + + def _delayed_fit_classifier(self, args): + with qp.util.temp_seed(self.random_state): + print('enter job') + cls_params, training, kwargs = args + model = deepcopy(self.base_quantifier) + model.set_params(**cls_params) + predictions = model.classifier_fit_predict(training, **kwargs) + print('exit job') + return (model, predictions) + + def _delayed_fit_aggregation(self, args): + with qp.util.temp_seed(self.random_state): + ((model, predictions), q_params), training = args + model = deepcopy(model) + model.set_params(**q_params) + model.aggregation_fit(predictions, training) + return model + + +
[docs] def fit(self, training: LabelledCollection, **kwargs): + import itertools + + self._check_binary(training, self.__class__.__name__) + + if isinstance(self.base_quantifier, AggregativeQuantifier): + cls_configs, q_configs = qp.model_selection.group_params(self.param_grid) + + if len(cls_configs) > 1: + models_preds = qp.util.parallel( + self._delayed_fit_classifier, + ((params, training, kwargs) for params in cls_configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + asarray=False, + backend='threading' + ) + else: + print('only 1') + model = self.base_quantifier + model.set_params(**cls_configs[0]) + predictions = model.classifier_fit_predict(training, **kwargs) + models_preds = [(model, predictions)] + + self.models = qp.util.parallel( + self._delayed_fit_aggregation, + ((setup, training) for setup in itertools.product(models_preds, q_configs)), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + backend='threading' + ) + else: + configs = qp.model_selection.expand_grid(self.param_grid) + self.models = qp.util.parallel( + self._delayed_fit, + ((params, training) for params in configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + backend='threading' + ) + return self
+ + def _delayed_predict(self, args): + model, instances = args + return model.quantify(instances) + +
[docs] def quantify(self, instances): + prev_preds = qp.util.parallel( + self._delayed_predict, + ((model, instances) for model in self.models), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + backend='threading' + ) + return np.median(prev_preds, axis=0)
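A usage sketch mirroring the commented-out HDy helper shown earlier: the median over several DMy configurations on synthetic binary data (the LabelledCollection-based `fit` interface of this version is assumed):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.aggregative import DMy, AggregativeMedianEstimator

X = np.random.rand(500, 4)
y = (X[:, 0] > 0.5).astype(int)
train = LabelledCollection(X, y)

median_q = AggregativeMedianEstimator(
    base_quantifier=DMy(LogisticRegression()),
    param_grid={'nbins': [4, 8, 16, 32]},
    n_jobs=1)
median_q.fit(train)
print(median_q.quantify(X))  # the median of the four DMy estimates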
+ + +#--------------------------------------------------------------- +# imports +#--------------------------------------------------------------- + +from . import _threshold_optim + +T50 = _threshold_optim.T50 +MAX = _threshold_optim.MAX +X = _threshold_optim.X +MS = _threshold_optim.MS +MS2 = _threshold_optim.MS2 + + +from . import _kdey + +KDEyML = _kdey.KDEyML +KDEyHD = _kdey.KDEyHD +KDEyCS = _kdey.KDEyCS + +#--------------------------------------------------------------- +# aliases +#--------------------------------------------------------------- + +ClassifyAndCount = CC +AdjustedClassifyAndCount = ACC +ProbabilisticClassifyAndCount = PCC +ProbabilisticAdjustedClassifyAndCount = PACC +ExpectationMaximizationQuantifier = EMQ +DistributionMatchingY = DMy +SLD = EMQ +HellingerDistanceY = HDy +MedianSweep = MS +MedianSweep2 = MS2 +
\ No newline at end of file diff --git a/docs/build/html/_modules/quapy/method/base.html b/docs/build/html/_modules/quapy/method/base.html new file mode 100644 index 0000000..6288bd1 --- /dev/null +++ b/docs/build/html/_modules/quapy/method/base.html @@ -0,0 +1,212 @@
+ quapy.method.base — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation

Source code for quapy.method.base

+from abc import ABCMeta, abstractmethod
+from copy import deepcopy
+
+from joblib import Parallel, delayed
+from sklearn.base import BaseEstimator
+
+import quapy as qp
+from quapy.data import LabelledCollection
+import numpy as np
+
+
+# Base Quantifier abstract class
+# ------------------------------------
+
[docs]class BaseQuantifier(BaseEstimator): + """ + Abstract Quantifier. A quantifier is defined as an object of a class that implements the method :meth:`fit` on + :class:`quapy.data.base.LabelledCollection`, the method :meth:`quantify`, and the :meth:`set_params` and + :meth:`get_params` for model selection (see :meth:`quapy.model_selection.GridSearchQ`) + """ + +
[docs] @abstractmethod + def fit(self, data: LabelledCollection): + """ + Trains a quantifier. + + :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :return: self + """ + ...
+ +
[docs] @abstractmethod + def quantify(self, instances): + """ + Generate class prevalence estimates for the sample's instances + + :param instances: array-like + :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates. + """ + ...
+ + +
[docs]class BinaryQuantifier(BaseQuantifier): + """ + Abstract class of binary quantifiers, i.e., quantifiers estimating class prevalence values for only two classes + (typically, to be interpreted as one class and its complement). + """ + + def _check_binary(self, data: LabelledCollection, quantifier_name): + assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \ + f'Use the class OneVsAll to enable {quantifier_name} to work on single-label data.'
+ + +
[docs]class OneVsAll: + pass
+ + +
[docs]def newOneVsAll(binary_quantifier, n_jobs=None): + assert isinstance(binary_quantifier, BaseQuantifier), \ + f'{binary_quantifier} does not seem to be a Quantifier' + if isinstance(binary_quantifier, qp.method.aggregative.AggregativeQuantifier): + return qp.method.aggregative.OneVsAllAggregative(binary_quantifier, n_jobs) + else: + return OneVsAllGeneric(binary_quantifier, n_jobs)
+ + +
[docs]class OneVsAllGeneric(OneVsAll, BaseQuantifier): + """ + Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary + quantifier for each class, and then l1-normalizes the outputs so that the class prevalence values sum up to 1. + """ + + def __init__(self, binary_quantifier, n_jobs=None): + assert isinstance(binary_quantifier, BaseQuantifier), \ + f'{binary_quantifier} does not seem to be a Quantifier' + if isinstance(binary_quantifier, qp.method.aggregative.AggregativeQuantifier): + print('[warning] the quantifier seems to be an instance of qp.method.aggregative.AggregativeQuantifier; ' + f'you might prefer instantiating {qp.method.aggregative.OneVsAllAggregative.__name__}') + self.binary_quantifier = binary_quantifier + self.n_jobs = qp._get_njobs(n_jobs) + +
[docs] def fit(self, data: LabelledCollection, fit_classifier=True): + assert not data.binary, f'{self.__class__.__name__} expects non-binary data' + assert fit_classifier == True, 'fit_classifier must be True' + + self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_} + self._parallel(self._delayed_binary_fit, data) + return self
+ + def _parallel(self, func, *args, **kwargs): + return np.asarray( + Parallel(n_jobs=self.n_jobs, backend='threading')( + delayed(func)(c, *args, **kwargs) for c in self.classes_ + ) + ) + +
[docs] def quantify(self, instances): + prevalences = self._parallel(self._delayed_binary_predict, instances) + return qp.functional.normalize_prevalence(prevalences)
+ + @property + def classes_(self): + return sorted(self.dict_binary_quantifiers.keys()) + + def _delayed_binary_predict(self, c, X): + return self.dict_binary_quantifiers[c].quantify(X)[1] + + def _delayed_binary_fit(self, c, data): + bindata = LabelledCollection(data.instances, data.labels == c, classes=[False, True]) + self.dict_binary_quantifiers[c].fit(bindata)
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/method/meta.html b/docs/build/html/_modules/quapy/method/meta.html new file mode 100644 index 0000000..ca38440 --- /dev/null +++ b/docs/build/html/_modules/quapy/method/meta.html @@ -0,0 +1,796 @@ + + + + + + quapy.method.meta — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.method.meta

+import itertools
+from copy import deepcopy
+from typing import Union
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import f1_score, make_scorer, accuracy_score
+from sklearn.model_selection import GridSearchCV, cross_val_predict
+from tqdm import tqdm
+
+import quapy as qp
+from quapy import functional as F
+from quapy.data import LabelledCollection
+from quapy.model_selection import GridSearchQ
+from quapy.method.base import BaseQuantifier, BinaryQuantifier
+from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ, AggregativeQuantifier
+
+try:
+    from . import _neural
+except ModuleNotFoundError:
+    _neural = None
+
+
+if _neural:
+    QuaNet = _neural.QuaNetTrainer
+else:
+    QuaNet = "QuaNet is not available due to missing torch package"
+
+
+
[docs]class MedianEstimator2(BinaryQuantifier): + """ + This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the + estimates returned by differently (hyper)parameterized base quantifiers. + The median of unit-vectors is only guaranteed to be a unit-vector for n=2 dimensions, + i.e., in cases of binary quantification. + + :param base_quantifier: the base, binary quantifier + :param random_state: a seed to be set before fitting any base quantifier (default None) + :param param_grid: the grid of parameters over which the median will be computed + :param n_jobs: number of parallel workers + """ + def __init__(self, base_quantifier: BinaryQuantifier, param_grid: dict, random_state=None, n_jobs=None): + self.base_quantifier = base_quantifier + self.param_grid = param_grid + self.random_state = random_state + self.n_jobs = qp._get_njobs(n_jobs) + +
[docs] def get_params(self, deep=True): + return self.base_quantifier.get_params(deep)
+ +
[docs] def set_params(self, **params): + self.base_quantifier.set_params(**params)
+ + def _delayed_fit(self, args): + with qp.util.temp_seed(self.random_state): + params, training = args + model = deepcopy(self.base_quantifier) + model.set_params(**params) + model.fit(training) + return model + +
[docs] def fit(self, training: LabelledCollection): + self._check_binary(training, self.__class__.__name__) + + configs = qp.model_selection.expand_grid(self.param_grid) + self.models = qp.util.parallel( + self._delayed_fit, + ((params, training) for params in configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs + ) + return self
+ + def _delayed_predict(self, args): + model, instances = args + return model.quantify(instances) + +
[docs] def quantify(self, instances): + prev_preds = qp.util.parallel( + self._delayed_predict, + ((model, instances) for model in self.models), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs + ) + prev_preds = np.asarray(prev_preds) + return np.median(prev_preds, axis=0)
+ + +
[docs]class MedianEstimator(BinaryQuantifier): + """ + This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the + estimates returned by differently (hyper)parameterized base quantifiers. + The median of unit-vectors is only guaranteed to be a unit-vector for n=2 dimensions, + i.e., in cases of binary quantification. + + :param base_quantifier: the base, binary quantifier + :param random_state: a seed to be set before fitting any base quantifier (default None) + :param param_grid: the grid of parameters over which the median will be computed + :param n_jobs: number of parallel workers + """ + def __init__(self, base_quantifier: BinaryQuantifier, param_grid: dict, random_state=None, n_jobs=None): + self.base_quantifier = base_quantifier + self.param_grid = param_grid + self.random_state = random_state + self.n_jobs = qp._get_njobs(n_jobs) + +
[docs] def get_params(self, deep=True): + return self.base_quantifier.get_params(deep)
+ +
[docs] def set_params(self, **params): + self.base_quantifier.set_params(**params)
+ + def _delayed_fit(self, args): + with qp.util.temp_seed(self.random_state): + params, training = args + model = deepcopy(self.base_quantifier) + model.set_params(**params) + model.fit(training) + return model + + def _delayed_fit_classifier(self, args): + with qp.util.temp_seed(self.random_state): + cls_params, training = args + model = deepcopy(self.base_quantifier) + model.set_params(**cls_params) + predictions = model.classifier_fit_predict(training, predict_on=model.val_split) + return (model, predictions) + + def _delayed_fit_aggregation(self, args): + with qp.util.temp_seed(self.random_state): + ((model, predictions), q_params), training = args + model = deepcopy(model) + model.set_params(**q_params) + model.aggregation_fit(predictions, training) + return model + + +
[docs] def fit(self, training: LabelledCollection): + self._check_binary(training, self.__class__.__name__) + + if isinstance(self.base_quantifier, AggregativeQuantifier): + cls_configs, q_configs = qp.model_selection.group_params(self.param_grid) + + if len(cls_configs) > 1: + models_preds = qp.util.parallel( + self._delayed_fit_classifier, + ((params, training) for params in cls_configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + asarray=False + ) + else: + model = self.base_quantifier + model.set_params(**cls_configs[0]) + predictions = model.classifier_fit_predict(training, predict_on=model.val_split) + models_preds = [(model, predictions)] + + self.models = qp.util.parallel( + self._delayed_fit_aggregation, + ((setup, training) for setup in itertools.product(models_preds, q_configs)), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + asarray=False + ) + else: + configs = qp.model_selection.expand_grid(self.param_grid) + self.models = qp.util.parallel( + self._delayed_fit, + ((params, training) for params in configs), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + asarray=False + ) + return self
+ + def _delayed_predict(self, args): + model, instances = args + return model.quantify(instances) + +
[docs] def quantify(self, instances): + prev_preds = qp.util.parallel( + self._delayed_predict, + ((model, instances) for model in self.models), + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs, + asarray=False + ) + prev_preds = np.asarray(prev_preds) + return np.median(prev_preds, axis=0)
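A usage sketch of MedianEstimator. The variables `training` (a binary LabelledCollection) and `X_test` are hypothetical, and the example assumes the `classifier__` prefix convention used elsewhere in QuaPy for addressing classifier hyperparameters:

>>> from sklearn.linear_model import LogisticRegression
>>> from quapy.method.aggregative import PACC
>>> from quapy.method.meta import MedianEstimator
>>> grid = {'classifier__C': [0.1, 1.0, 10.0]}
>>> med = MedianEstimator(PACC(LogisticRegression()), param_grid=grid, random_state=0, n_jobs=-1)
>>> med.fit(training)                  # fits one PACC per configuration in the grid
>>> estim_prev = med.quantify(X_test)  # component-wise median of the three estimates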
+ + +
[docs]class Ensemble(BaseQuantifier): + VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES + + """ + Implementation of the Ensemble methods for quantification described by + `Pérez-Gállego et al., 2017 <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_ + and + `Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. + The policies implemented include: + + - Average (`policy='ave'`): computes class prevalence estimates as the average of the estimates + returned by the base quantifiers. + - Training Prevalence (`policy='ptr'`): applies a dynamic selection to the ensemble’s members by retaining only + those members such that the class prevalence values in the samples they use as training set are closest to + preliminary class prevalence estimates computed as the average of the estimates of all the members. The final + estimate is recomputed by considering only the selected members. + - Distribution Similarity (`policy='ds'`): performs a dynamic selection of base members by retaining + the members trained on samples whose distribution of posterior probabilities is closest, in terms of the + Hellinger Distance, to the distribution of posterior probabilities in the test sample + - Accuracy (`policy='<valid error name>'`): performs a static selection of the ensemble members by + retaining those that minimize a quantification error measure, which is passed as an argument. + + Example: + + >>> model = Ensemble(quantifier=ACC(LogisticRegression()), size=30, policy='ave', n_jobs=-1) + + :param quantifier: base quantification member of the ensemble + :param size: number of members + :param red_size: number of members to retain after selection (depending on the policy) + :param min_pos: minimum number of positive instances to consider a sample as valid + :param policy: the selection policy; available policies include: `ave` (default), `ptr`, `ds`, and accuracy + (which is instantiated via a valid error name, e.g., `mae`) + :param max_sample_size: maximum number of instances to consider in the samples (set to None + to indicate no limit, default) + :param val_split: a float in range (0,1) indicating the proportion of data to be used as a stratified held-out + validation split, or a :class:`quapy.data.base.LabelledCollection` (the split itself). + :param n_jobs: number of parallel workers (default 1) + :param verbose: set to True (default is False) to get some information in standard output + """ + + def __init__(self, + quantifier: BaseQuantifier, + size=50, + red_size=25, + min_pos=5, + policy='ave', + max_sample_size=None, + val_split:Union[qp.data.LabelledCollection, float]=None, + n_jobs=None, + verbose=False): + assert policy in Ensemble.VALID_POLICIES, \ + f'unknown policy={policy}; valid are {Ensemble.VALID_POLICIES}' + assert max_sample_size is None or max_sample_size > 0, \ + 'wrong value for max_sample_size; set it to a positive number or None' + self.base_quantifier = quantifier + self.size = size + self.min_pos = min_pos + self.red_size = red_size + self.policy = policy + self.val_split = val_split + self.n_jobs = qp._get_njobs(n_jobs) + self.post_proba_fn = None + self.verbose = verbose + self.max_sample_size = max_sample_size + + def _sout(self, msg): + if self.verbose: + print('[Ensemble]' + msg) + +
[docs] def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float] = None): + + if self.policy == 'ds' and not data.binary: + raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary') + + if val_split is None: + val_split = self.val_split + + # randomly chooses the prevalences for each member of the ensemble (preventing classes with less than + # min_pos positive examples) + sample_size = len(data) if self.max_sample_size is None else min(self.max_sample_size, len(data)) + prevs = [_draw_simplex(ndim=data.n_classes, min_val=self.min_pos / sample_size) for _ in range(self.size)] + + posteriors = None + if self.policy == 'ds': + # precompute the training posterior probabilities + posteriors, self.post_proba_fn = self._ds_policy_get_posteriors(data) + + is_static_policy = (self.policy in qp.error.QUANTIFICATION_ERROR_NAMES) + + args = ( + (self.base_quantifier, data, val_split, prev, posteriors, is_static_policy, self.verbose, sample_size) + for prev in prevs + ) + self.ensemble = qp.util.parallel( + _delayed_new_instance, + tqdm(args, desc='fitting ensamble', total=self.size) if self.verbose else args, + asarray=False, + n_jobs=self.n_jobs) + + # static selection policy (the name of a quantification-oriented error function to minimize) + if self.policy in qp.error.QUANTIFICATION_ERROR_NAMES: + self._accuracy_policy(error_name=self.policy) + + self._sout('Fit [Done]') + return self
+ +
[docs] def quantify(self, instances): + predictions = np.asarray( + qp.util.parallel(_delayed_quantify, ((Qi, instances) for Qi in self.ensemble), n_jobs=self.n_jobs) + ) + + if self.policy == 'ptr': + predictions = self._ptr_policy(predictions) + elif self.policy == 'ds': + predictions = self._ds_policy(predictions, instances) + + predictions = np.mean(predictions, axis=0) + return F.normalize_prevalence(predictions)
+ +
[docs] def set_params(self, **parameters): + """ + This function should not be used within :class:`quapy.model_selection.GridSearchQ` (is here for compatibility + with the abstract class). + Instead, use `Ensemble(GridSearchQ(q),...)`, with `q` a Quantifier (recommended), or + `Ensemble(Q(GridSearchCV(l)))` with `Q` a quantifier class that has a classifier `l` optimized for + classification (not recommended). + + :param parameters: dictionary + :return: raises an Exception + """ + raise NotImplementedError(f'{self.__class__.__name__} should not be used within GridSearchQ; ' + f'instead, use Ensemble(GridSearchQ(q),...), with q a Quantifier (recommended), ' + f'or Ensemble(Q(GridSearchCV(l))) with Q a quantifier class that has a classifier ' + f'l optimized for classification (not recommended).')
+ +
[docs] def get_params(self, deep=True): + """ + This function should not be used within :class:`quapy.model_selection.GridSearchQ` (is here for compatibility + with the abstract class). + Instead, use `Ensemble(GridSearchQ(q),...)`, with `q` a Quantifier (recommended), or + `Ensemble(Q(GridSearchCV(l)))` with `Q` a quantifier class that has a classifier `l` optimized for + classification (not recommended). + + :param deep: for compatibility with scikit-learn + :return: raises an Exception + """ + + raise NotImplementedError()
+ + def _accuracy_policy(self, error_name): + """ + Selects the red_size best performant quantifiers in a static way (i.e., dropping all non-selected instances). + For each model in the ensemble, the performance is measured in terms of _error_name_ on the quantification of + the samples used for training the rest of the models in the ensemble. + """ + from quapy.evaluation import evaluate_on_samples + error = qp.error.from_name(error_name) + tests = [m[3] for m in self.ensemble] + scores = [] + for i, model in enumerate(self.ensemble): + scores.append(evaluate_on_samples(model[0], tests[:i] + tests[i + 1:], error)) + order = np.argsort(scores) + + self.ensemble = _select_k(self.ensemble, order, k=self.red_size) + + def _ptr_policy(self, predictions): + """ + Selects the predictions made by models that have been trained on samples with a prevalence that is most similar + to a first approximation of the test prevalence as made by all models in the ensemble. + """ + test_prev_estim = predictions.mean(axis=0) + tr_prevs = [m[1] for m in self.ensemble] + ptr_differences = [qp.error.mse(ptr_i, test_prev_estim) for ptr_i in tr_prevs] + order = np.argsort(ptr_differences) + return _select_k(predictions, order, k=self.red_size) + + def _ds_policy_get_posteriors(self, data: LabelledCollection): + """ + In the original article, there are some aspects regarding this method that are not mentioned. The paper says + that the distribution of posterior probabilities from training and test examples is compared by means of the + Hellinger Distance. However, how these posterior probabilities are generated is not specified. In the article, + a Logistic Regressor (LR) is used as the classifier device and that could be used for this purpose. However, in + general, a Quantifier is not necessarily an instance of Aggreggative Probabilistic Quantifiers, and so, that the + quantifier builds on top of a probabilistic classifier cannot be given for granted. Additionally, it would not + be correct to generate the posterior probabilities for training instances that have concurred in training the + classifier that generates them. + + This function thus generates the posterior probabilities for all training documents in a cross-validation way, + using LR with hyperparameters that have previously been optimized via grid search in 5FCV. + + :param data: a LabelledCollection + :return: (P,f,) where P is an ndarray containing the posterior probabilities of the training data, generated via + cross-validation and using an optimized LR, and the function to be used in order to generate posterior + probabilities for test instances. 
+ """ + + X, y = data.Xy + lr_base = LogisticRegression(class_weight='balanced', max_iter=1000) + + param_grid = {'C': np.logspace(-4, 4, 9)} + optim = GridSearchCV(lr_base, param_grid=param_grid, cv=5, n_jobs=self.n_jobs, refit=True).fit(X, y) + + posteriors = cross_val_predict(optim.best_estimator_, X, y, cv=5, n_jobs=self.n_jobs, method='predict_proba') + posteriors_generator = optim.best_estimator_.predict_proba + + return posteriors, posteriors_generator + + def _ds_policy(self, predictions, test): + test_posteriors = self.post_proba_fn(test) + test_distribution = get_probability_distribution(test_posteriors) + tr_distributions = [m[2] for m in self.ensemble] + dist = [F.HellingerDistance(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions] + order = np.argsort(dist) + return _select_k(predictions, order, k=self.red_size) + + @property + def aggregative(self): + """ + Indicates that the quantifier is not aggregative. + + :return: False + """ + return False + + @property + def probabilistic(self): + """ + Indicates that the quantifier is not probabilistic. + + :return: False + """ + return False
+ + +
[docs]def get_probability_distribution(posterior_probabilities, bins=8): + """ + Gets a histogram out of the posterior probabilities (only for the binary case). + + :param posterior_probabilities: array-like of shape `(n_instances, 2,)` + :param bins: integer + :return: `np.ndarray` with the relative frequencies for each bin (for the positive class only) + """ + assert posterior_probabilities.shape[1] == 2, 'the posterior probabilities do not seem to be for a binary problem' + posterior_probabilities = posterior_probabilities[:, 1] # take the positive posteriors only + distribution, _ = np.histogram(posterior_probabilities, bins=bins, range=(0, 1), density=True) + return distribution
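For illustration, a small made-up call showing the expected input shape (a matrix of binary posterior probabilities) and the resulting number of bins:

>>> import numpy as np
>>> from quapy.method.meta import get_probability_distribution
>>> posteriors = np.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5], [0.3, 0.7]])
>>> hist = get_probability_distribution(posteriors, bins=4)  # histogram of the positive posteriors over [0,1]
>>> hist.shape
(4,)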
+ + +def _select_k(elements, order, k): + return [elements[idx] for idx in order[:k]] + + +def _delayed_new_instance(args): + base_quantifier, data, val_split, prev, posteriors, keep_samples, verbose, sample_size = args + if verbose: + print(f'\tfit-start for prev {F.strprev(prev)}, sample_size={sample_size}') + model = deepcopy(base_quantifier) + + if val_split is not None: + if isinstance(val_split, float): + assert 0 < val_split < 1, 'val_split should be in (0,1)' + data, val_split = data.split_stratified(train_prop=1 - val_split) + + sample_index = data.sampling_index(sample_size, *prev) + sample = data.sampling_from_index(sample_index) + + if val_split is not None: + model.fit(sample, val_split=val_split) + else: + model.fit(sample) + + tr_prevalence = sample.prevalence() + tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None + + if verbose: + print(f'\t\--fit-ended for prev {F.strprev(prev)}') + + return (model, tr_prevalence, tr_distribution, sample if keep_samples else None) + + +def _delayed_quantify(args): + quantifier, instances = args + return quantifier[0].quantify(instances) + + +def _draw_simplex(ndim, min_val, max_trials=100): + """ + Returns a uniform sampling from the ndim-dimensional simplex but guarantees that all dimensions + are >= min_class_prev (for min_val>0, this makes the sampling not truly uniform) + + :param ndim: number of dimensions of the simplex + :param min_val: minimum class prevalence allowed. If less than 1/ndim a ValueError will be throw since + there is no possible solution. + :return: a sample from the ndim-dimensional simplex that is uniform in S(ndim)-R where S(ndim) is the simplex + and R is the simplex subset containing dimensions lower than min_val + """ + if min_val >= 1 / ndim: + raise ValueError(f'no sample can be draw from the {ndim}-dimensional simplex so that ' + f'all its values are >={min_val} (try with a larger value for min_pos)') + trials = 0 + while True: + u = F.uniform_simplex_sampling(ndim) + if all(u >= min_val): + return u + trials += 1 + if trials >= max_trials: + raise ValueError(f'it looks like finding a random simplex with all its dimensions being' + f'>= {min_val} is unlikely (it failed after {max_trials} trials)') + + +def _instantiate_ensemble(classifier, base_quantifier_class, param_grid, optim, param_model_sel, **kwargs): + if optim is None: + base_quantifier = base_quantifier_class(classifier) + elif optim in qp.error.CLASSIFICATION_ERROR: + if optim == qp.error.f1e: + scoring = make_scorer(f1_score) + elif optim == qp.error.acce: + scoring = make_scorer(accuracy_score) + classifier = GridSearchCV(classifier, param_grid, scoring=scoring) + base_quantifier = base_quantifier_class(classifier) + else: + base_quantifier = GridSearchQ(base_quantifier_class(classifier), + param_grid=param_grid, + **param_model_sel, + error=optim) + + return Ensemble(base_quantifier, **kwargs) + + +def _check_error(error): + if error is None: + return None + if error in qp.error.QUANTIFICATION_ERROR or error in qp.error.CLASSIFICATION_ERROR: + return error + elif isinstance(error, str): + return qp.error.from_name(error) + else: + raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n' + f'the name of an error function in {qp.error.ERROR_NAMES}') + + +
[docs]def ensembleFactory(classifier, base_quantifier_class, param_grid=None, optim=None, param_model_sel: dict = None, + **kwargs): + """ + Ensemble factory. Provides a unified interface for instantiating ensembles that can be optimized (via model + selection for quantification) for a given evaluation metric using :class:`quapy.model_selection.GridSearchQ`. + If the evaluation metric is classification-oriented + (instead of quantification-oriented), then the optimization will be carried out via sklearn's + `GridSearchCV <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html>`_. + + Example to instantiate an :class:`Ensemble` based on :class:`quapy.method.aggregative.PACC` + in which the base members are optimized for :meth:`quapy.error.mae` via + :class:`quapy.model_selection.GridSearchQ`. The ensemble follows the policy `Accuracy` based + on :meth:`quapy.error.mae` (the same measure being optimized), + meaning that a static selection of members of the ensemble is made based on their performance + in terms of this error. + + >>> param_grid = { + >>> 'C': np.logspace(-3,3,7), + >>> 'class_weight': ['balanced', None] + >>> } + >>> param_mod_sel = { + >>> 'sample_size': 500, + >>> 'protocol': 'app' + >>> } + >>> common={ + >>> 'max_sample_size': 1000, + >>> 'n_jobs': -1, + >>> 'param_grid': param_grid, + >>> 'param_mod_sel': param_mod_sel, + >>> } + >>> + >>> ensembleFactory(LogisticRegression(), PACC, optim='mae', policy='mae', **common) + + :param classifier: sklearn's Estimator that generates a classifier + :param base_quantifier_class: a class of quantifiers + :param param_grid: a dictionary with the grid of parameters to optimize for + :param optim: a valid quantification or classification error, or a string name of it + :param param_model_sel: a dictionary containing any keyworded argument to pass to + :class:`quapy.model_selection.GridSearchQ` + :param kwargs: kwargs for the class :class:`Ensemble` + :return: an instance of :class:`Ensemble` + """ + if optim is not None: + if param_grid is None: + raise ValueError(f'param_grid is None but optim was requested.') + if param_model_sel is None: + raise ValueError(f'param_model_sel is None but optim was requested.') + error = _check_error(optim) + return _instantiate_ensemble(classifier, base_quantifier_class, param_grid, error, param_model_sel, **kwargs)
+ + +
[docs]def ECC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs): + """ + Implements an ensemble of :class:`quapy.method.aggregative.CC` quantifiers, as used by + `Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. + + Equivalent to: + + >>> ensembleFactory(classifier, CC, param_grid, optim, param_mod_sel, **kwargs) + + See :meth:`ensembleFactory` for further details. + + :param classifier: sklearn's Estimator that generates a classifier + :param param_grid: a dictionary with the grid of parameters to optimize for + :param optim: a valid quantification or classification error, or a string name of it + :param param_model_sel: a dictionary containing any keyworded argument to pass to + :class:`quapy.model_selection.GridSearchQ` + :param kwargs: kwargs for the class :class:`Ensemble` + :return: an instance of :class:`Ensemble` + """ + + return ensembleFactory(classifier, CC, param_grid, optim, param_mod_sel, **kwargs)
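A minimal sketch of how these shortcuts can be used without hyperparameter optimization; `training` (a LabelledCollection) and `X_test` are hypothetical, and the extra keyword arguments are forwarded to the Ensemble constructor:

>>> from sklearn.linear_model import LogisticRegression
>>> from quapy.method.meta import ECC
>>> ecc = ECC(LogisticRegression(), size=30, policy='ave', n_jobs=-1)
>>> ecc.fit(training)
>>> estim_prev = ecc.quantify(X_test)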
+ + +
[docs]def EACC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs): + """ + Implements an ensemble of :class:`quapy.method.aggregative.ACC` quantifiers, as used by + `Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. + + Equivalent to: + + >>> ensembleFactory(classifier, ACC, param_grid, optim, param_mod_sel, **kwargs) + + See :meth:`ensembleFactory` for further details. + + :param classifier: sklearn's Estimator that generates a classifier + :param param_grid: a dictionary with the grid of parameters to optimize for + :param optim: a valid quantification or classification error, or a string name of it + :param param_model_sel: a dictionary containing any keyworded argument to pass to + :class:`quapy.model_selection.GridSearchQ` + :param kwargs: kwargs for the class :class:`Ensemble` + :return: an instance of :class:`Ensemble` + """ + + return ensembleFactory(classifier, ACC, param_grid, optim, param_mod_sel, **kwargs)
+ + +
[docs]def EPACC(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs): + """ + Implements an ensemble of :class:`quapy.method.aggregative.PACC` quantifiers. + + Equivalent to: + + >>> ensembleFactory(classifier, PACC, param_grid, optim, param_mod_sel, **kwargs) + + See :meth:`ensembleFactory` for further details. + + :param classifier: sklearn's Estimator that generates a classifier + :param param_grid: a dictionary with the grid of parameters to optimize for + :param optim: a valid quantification or classification error, or a string name of it + :param param_model_sel: a dictionary containing any keyworded argument to pass to + :class:`quapy.model_selection.GridSearchQ` + :param kwargs: kwargs for the class :class:`Ensemble` + :return: an instance of :class:`Ensemble` + """ + + return ensembleFactory(classifier, PACC, param_grid, optim, param_mod_sel, **kwargs)
+ + +
[docs]def EHDy(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs): + """ + Implements an ensemble of :class:`quapy.method.aggregative.HDy` quantifiers, as used by + `Pérez-Gállego et al., 2019 <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_. + + Equivalent to: + + >>> ensembleFactory(classifier, HDy, param_grid, optim, param_mod_sel, **kwargs) + + See :meth:`ensembleFactory` for further details. + + :param classifier: sklearn's Estimator that generates a classifier + :param param_grid: a dictionary with the grid of parameters to optimize for + :param optim: a valid quantification or classification error, or a string name of it + :param param_model_sel: a dictionary containing any keyworded argument to pass to + :class:`quapy.model_selection.GridSearchQ` + :param kwargs: kwargs for the class :class:`Ensemble` + :return: an instance of :class:`Ensemble` + """ + + return ensembleFactory(classifier, HDy, param_grid, optim, param_mod_sel, **kwargs)
+ + +
[docs]def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs): + """ + Implements an ensemble of :class:`quapy.method.aggregative.EMQ` quantifiers. + + Equivalent to: + + >>> ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs) + + See :meth:`ensembleFactory` for further details. + + :param classifier: sklearn's Estimator that generates a classifier + :param param_grid: a dictionary with the grid of parameters to optimize for + :param optim: a valid quantification or classification error, or a string name of it + :param param_model_sel: a dictionary containing any keyworded argument to pass to + :class:`quapy.model_selection.GridSearchQ` + :param kwargs: kwargs for the class :class:`Ensemble` + :return: an instance of :class:`Ensemble` + """ + + return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)
+
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/method/non_aggregative.html b/docs/build/html/_modules/quapy/method/non_aggregative.html new file mode 100644 index 0000000..aeb5b96 --- /dev/null +++ b/docs/build/html/_modules/quapy/method/non_aggregative.html @@ -0,0 +1,266 @@ + + + + + + quapy.method.non_aggregative — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.method.non_aggregative

+from typing import Union, Callable
+import numpy as np
+
+from quapy.functional import get_divergence
+from quapy.data import LabelledCollection
+from quapy.method.base import BaseQuantifier, BinaryQuantifier
+import quapy.functional as F
+
+
+
[docs]class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier): + """ + The `Maximum Likelihood Prevalence Estimation` (MLPE) method is a lazy method that assumes there is no prior + probability shift between training and test instances (put another way, that the i.i.d. assumption holds). + The estimation of class prevalence values for any test sample is always (i.e., irrespective of the test sample + itself) the class prevalence seen during training. This method is considered to be a lower-bound quantifier that + any quantification method should beat. + """ + + def __init__(self): + self._classes_ = None + +
[docs] def fit(self, data: LabelledCollection): + """ + Computes the training prevalence and stores it. + + :param data: the training sample + :return: self + """ + self.estimated_prevalence = data.prevalence() + return self
+ +
[docs] def quantify(self, instances): + """ + Ignores the input instances and returns, as the class prevalence estimates, the training prevalence. + + :param instances: array-like (ignored) + :return: the class prevalence seen during training + """ + return self.estimated_prevalence
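A short usage sketch (the names `training` and `X_test` are hypothetical):

>>> from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
>>> mlpe = MaximumLikelihoodPrevalenceEstimation()
>>> mlpe.fit(training)        # stores the training prevalence
>>> mlpe.quantify(X_test)     # returns that same vector, whatever the test sample is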
+ + +
[docs]class DMx(BaseQuantifier): + """ + Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of covariates. + This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters. + + :param nbins: number of bins used to discretize the distributions (default 8) + :param divergence: a string representing a divergence measure (currently, "HD" and "topsoe" are implemented) + or a callable function taking two ndarrays of the same dimension as input (default "HD", meaning Hellinger + Distance) + :param cdf: whether to use CDF instead of PDF (default False) + :param n_jobs: number of parallel workers (default None) + """ + + def __init__(self, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, search='optim_minimize', n_jobs=None): + self.nbins = nbins + self.divergence = divergence + self.cdf = cdf + self.search = search + self.n_jobs = n_jobs + +
[docs] @classmethod + def HDx(cls, n_jobs=None): + """ + `Hellinger Distance x <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDx). + HDx is a method for training binary quantifiers that models quantification as the problem of + minimizing the average divergence (in terms of the Hellinger Distance) across the feature-specific normalized + histograms of two representations, one for the unlabelled examples, and another generated from the training + examples as a mixture model of the class-specific representations. The parameters of the mixture thus represent + the estimates of the class prevalence values. + + The method computes the matching for every nbins in [10, 20, ..., 110] and reports the median of the resulting estimates. + The best prevalence is searched via linear search, from 0 to 1 stepping by 0.01. + + :param n_jobs: number of parallel workers + :return: an instance of this class set up to mimic the performance of the HDx as originally proposed by + González-Castro, Alaiz-Rodríguez, Alegre (2013) + """ + from quapy.method.meta import MedianEstimator + + dmx = DMx(divergence='HD', cdf=False, search='linear_search') + nbins = {'nbins': np.linspace(10, 110, 11, dtype=int)} + hdx = MedianEstimator(base_quantifier=dmx, param_grid=nbins, n_jobs=n_jobs) + return hdx
+ + def __get_distributions(self, X): + + histograms = [] + for feat_idx in range(self.nfeats): + feature = X[:, feat_idx] + feat_range = self.feat_ranges[feat_idx] + hist = np.histogram(feature, bins=self.nbins, range=feat_range)[0] + norm_hist = hist / hist.sum() + histograms.append(norm_hist) + distributions = np.vstack(histograms) + + if self.cdf: + distributions = np.cumsum(distributions, axis=1) + + return distributions + +
[docs] def fit(self, data: LabelledCollection): + """ + Generates the validation distributions out of the training data (covariates). + The validation distributions have shape `(n, nfeats, nbins)`, with `n` the number of classes, `nfeats` + the number of features, and `nbins` the number of bins. + In particular, let `V` be the validation distributions; then `di=V[i]` are the distributions obtained from + training data labelled with class `i`; while `dij = di[j]` is the discrete distribution for feature j in + training data labelled with class `i`, and `dij[k]` is the fraction of instances with a value in the `k`-th bin. + + :param data: the training set + """ + X, y = data.Xy + + self.nfeats = X.shape[1] + self.feat_ranges = _get_features_range(X) + + self.validation_distribution = np.asarray( + [self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)] + ) + + return self
+ +
[docs] def quantify(self, instances): + """ + Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution + (the mixture) that best matches the test distribution, in terms of the divergence measure of choice. + The matching is computed as the average dissimilarity (in terms of the dissimilarity measure of choice) + between all feature-specific discrete distributions. + + :param instances: instances in the sample + :return: a vector of class prevalence estimates + """ + + assert instances.shape[1] == self.nfeats, f'wrong shape; expected {self.nfeats}, found {instances.shape[1]}' + + test_distribution = self.__get_distributions(instances) + divergence = get_divergence(self.divergence) + n_classes, n_feats, nbins = self.validation_distribution.shape + def loss(prev): + prev = np.expand_dims(prev, axis=0) + mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_feats, -1) + divs = [divergence(test_distribution[feat], mixture_distribution[feat]) for feat in range(n_feats)] + return np.mean(divs) + + return F.argmin_prevalence(loss, n_classes, method=self.search)
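A usage sketch of DMx under the same hypothetical variable names used above (`training` is a LabelledCollection, `X_test` a matrix of covariates):

>>> from quapy.method.non_aggregative import DMx
>>> dmx = DMx(nbins=8, divergence='HD', cdf=False)
>>> dmx.fit(training)              # builds per-class, per-feature histograms
>>> estim_prev = dmx.quantify(X_test)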
+ + + +def _get_features_range(X): + feat_ranges = [] + ncols = X.shape[1] + for col_idx in range(ncols): + feature = X[:,col_idx] + feat_ranges.append((np.min(feature), np.max(feature))) + return feat_ranges + + +#--------------------------------------------------------------- +# aliases +#--------------------------------------------------------------- + +DistributionMatchingX = DMx +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/model_selection.html b/docs/build/html/_modules/quapy/model_selection.html new file mode 100644 index 0000000..172c8f3 --- /dev/null +++ b/docs/build/html/_modules/quapy/model_selection.html @@ -0,0 +1,516 @@ + + + + + + quapy.model_selection — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.model_selection

+import itertools
+import signal
+from copy import deepcopy
+from enum import Enum
+from typing import Union, Callable
+from functools import wraps
+
+import numpy as np
+from sklearn import clone
+
+import quapy as qp
+from quapy import evaluation
+from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
+from quapy.data.base import LabelledCollection
+from quapy.method.aggregative import BaseQuantifier, AggregativeQuantifier
+from quapy.util import timeout
+from time import time
+
+
+
[docs]class Status(Enum): + SUCCESS = 1 + TIMEOUT = 2 + INVALID = 3 + ERROR = 4
+ + +
[docs]class ConfigStatus: + def __init__(self, params, status, msg=''): + self.params = params + self.status = status + self.msg = msg + + def __str__(self): + return f':params:{self.params} :status:{self.status} ' + self.msg + + def __repr__(self): + return str(self) + +
[docs] def success(self): + return self.status == Status.SUCCESS
+ +
[docs] def failed(self): + return self.status != Status.SUCCESS
+ + +
[docs]class GridSearchQ(BaseQuantifier): + """Grid Search optimization targeting a quantification-oriented metric. + + Optimizes the hyperparameters of a quantification method, based on an evaluation method and on an evaluation + protocol for quantification. + + :param model: the quantifier to optimize + :type model: BaseQuantifier + :param param_grid: a dictionary with keys the parameter names and values the list of values to explore + :param protocol: a sample generation protocol, an instance of :class:`quapy.protocol.AbstractProtocol` + :param error: an error function (callable) or a string indicating the name of an error function (valid ones + are those in :class:`quapy.error.QUANTIFICATION_ERROR` + :param refit: whether to refit the model on the whole labelled collection (training+validation) with + the best chosen hyperparameter combination. Ignored if protocol='gen' + :param timeout: establishes a timer (in seconds) for each of the hyperparameters configurations being tested. + Whenever a run takes longer than this timer, that configuration will be ignored. If all configurations end up + being ignored, a TimeoutError exception is raised. If -1 (default) then no time bound is set. + :param raise_errors: boolean, if True then raises an exception when a param combination yields any error, if + otherwise is False (default), then the combination is marked with an error status, but the process goes on. + However, if no configuration yields a valid model, then a ValueError exception will be raised. + :param verbose: set to True to get information through the stdout + """ + + def __init__(self, + model: BaseQuantifier, + param_grid: dict, + protocol: AbstractProtocol, + error: Union[Callable, str] = qp.error.mae, + refit=True, + timeout=-1, + n_jobs=None, + raise_errors=False, + verbose=False): + + self.model = model + self.param_grid = param_grid + self.protocol = protocol + self.refit = refit + self.timeout = timeout + self.n_jobs = qp._get_njobs(n_jobs) + self.raise_errors = raise_errors + self.verbose = verbose + self.__check_error(error) + assert isinstance(protocol, AbstractProtocol), 'unknown protocol' + + def _sout(self, msg): + if self.verbose: + print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}') + + def __check_error(self, error): + if error in qp.error.QUANTIFICATION_ERROR: + self.error = error + elif isinstance(error, str): + self.error = qp.error.from_name(error) + elif hasattr(error, '__call__'): + self.error = error + else: + raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n' + f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}') + + def _prepare_classifier(self, cls_params): + model = deepcopy(self.model) + + def job(cls_params): + model.set_params(**cls_params) + predictions = model.classifier_fit_predict(self._training) + return predictions + + predictions, status, took = self._error_handler(job, cls_params) + self._sout(f'[classifier fit] hyperparams={cls_params} [took {took:.3f}s]') + return model, predictions, status, took + + def _prepare_aggregation(self, args): + model, predictions, cls_took, cls_params, q_params = args + model = deepcopy(model) + params = {**cls_params, **q_params} + + def job(q_params): + model.set_params(**q_params) + model.aggregation_fit(predictions, self._training) + score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error) + return score + + score, status, aggr_took = self._error_handler(job, q_params) + 
self._print_status(params, score, status, aggr_took) + return model, params, score, status, (cls_took+aggr_took) + + def _prepare_nonaggr_model(self, params): + model = deepcopy(self.model) + + def job(params): + model.set_params(**params) + model.fit(self._training) + score = evaluation.evaluate(model, protocol=self.protocol, error_metric=self.error) + return score + + score, status, took = self._error_handler(job, params) + self._print_status(params, score, status, took) + return model, params, score, status, took + + def _break_down_fit(self): + """ + Decides whether to break down the fit phase in two (classifier-fit followed by aggregation-fit). + In order to do so, some conditions should be met: a) the quantifier is of type aggregative, + b) the set of hyperparameters can be split into two disjoint non-empty groups. + + :return: True if the conditions are met, False otherwise + """ + if not isinstance(self.model, AggregativeQuantifier): + return False + cls_configs, q_configs = group_params(self.param_grid) + if (len(cls_configs) == 1) or (len(q_configs)==1): + return False + return True + + def _compute_scores_aggregative(self, training): + # break down the set of hyperparameters into two: classifier-specific, quantifier-specific + cls_configs, q_configs = group_params(self.param_grid) + + # train all classifiers and get the predictions + self._training = training + cls_outs = qp.util.parallel( + self._prepare_classifier, + cls_configs, + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs + ) + + # filter out classifier configurations that yielded any error + success_outs = [] + for (model, predictions, status, took), cls_config in zip(cls_outs, cls_configs): + if status.success(): + success_outs.append((model, predictions, took, cls_config)) + else: + self.error_collector.append(status) + + if len(success_outs) == 0: + raise ValueError('No valid configuration found for the classifier!') + + # explore the quantifier-specific hyperparameters for each valid training configuration + aggr_configs = [(*out, q_config) for out, q_config in itertools.product(success_outs, q_configs)] + aggr_outs = qp.util.parallel( + self._prepare_aggregation, + aggr_configs, + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs + ) + + return aggr_outs + + def _compute_scores_nonaggregative(self, training): + configs = expand_grid(self.param_grid) + self._training = training + scores = qp.util.parallel( + self._prepare_nonaggr_model, + configs, + seed=qp.environ.get('_R_SEED', None), + n_jobs=self.n_jobs + ) + return scores + + def _print_status(self, params, score, status, took): + if status.success(): + self._sout(f'hyperparams=[{params}]\t got {self.error.__name__} = {score:.5f} [took {took:.3f}s]') + else: + self._sout(f'error={status}') + +
[docs] def fit(self, training: LabelledCollection): + """ Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing + the error metric. + + :param training: the training set on which to optimize the hyperparameters + :return: self + """ + + if self.refit and not isinstance(self.protocol, OnLabelledCollectionProtocol): + raise RuntimeWarning( + f'"refit" was requested, but the protocol does not implement ' + f'the {OnLabelledCollectionProtocol.__name__} interface' + ) + + tinit = time() + + self.error_collector = [] + + self._sout(f'starting model selection with n_jobs={self.n_jobs}') + if self._break_down_fit(): + results = self._compute_scores_aggregative(training) + else: + results = self._compute_scores_nonaggregative(training) + + self.param_scores_ = {} + self.best_score_ = None + for model, params, score, status, took in results: + if status.success(): + if self.best_score_ is None or score < self.best_score_: + self.best_score_ = score + self.best_params_ = params + self.best_model_ = model + self.param_scores_[str(params)] = score + else: + self.param_scores_[str(params)] = status.status + self.error_collector.append(status) + + tend = time()-tinit + + if self.best_score_ is None: + raise ValueError('no combination of hyperparameters seemed to work') + + self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) ' + f'[took {tend:.4f}s]') + + no_errors = len(self.error_collector) + if no_errors>0: + self._sout(f'warning: {no_errors} errors found') + for err in self.error_collector: + self._sout(f'\t{str(err)}') + + if self.refit: + if isinstance(self.protocol, OnLabelledCollectionProtocol): + tinit = time() + self._sout(f'refitting on the whole development set') + self.best_model_.fit(training + self.protocol.get_labelled_collection()) + tend = time() - tinit + self.refit_time_ = tend + else: + # already checked + raise RuntimeWarning(f'the model cannot be refit on the whole dataset') + + return self
+ +
[docs] def quantify(self, instances): + """Estimate class prevalence values using the best model found after calling the :meth:`fit` method. + + :param instances: sample containing the instances + :return: an ndarray of shape `(n_classes,)` with class prevalence estimates according to the best model found + by the model selection process. + """ + assert hasattr(self, 'best_model_'), 'quantify called before fit' + return self.best_model().quantify(instances)
+ +
[docs] def set_params(self, **parameters): + """Sets the hyper-parameters to explore. + + :param parameters: a dictionary with keys the parameter names and values the list of values to explore + """ + self.param_grid = parameters
+ +
[docs] def get_params(self, deep=True): + """Returns the dictionary of hyper-parameters to explore (`param_grid`) + + :param deep: Unused + :return: the dictionary `param_grid` + """ + return self.param_grid
+ +
[docs] def best_model(self): + """ + Returns the best model found after calling the :meth:`fit` method, i.e., the one trained on the combination + of hyper-parameters that minimized the error function. + + :return: a trained quantifier + """ + if hasattr(self, 'best_model_'): + return self.best_model_ + raise ValueError('best_model called before fit')
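A usage sketch of GridSearchQ. The data splits are hypothetical, and the example assumes that qp.environ['SAMPLE_SIZE'] has been set, as the APP protocol typically requires:

>>> import quapy as qp
>>> from quapy.protocol import APP
>>> from quapy.method.aggregative import PACC
>>> from sklearn.linear_model import LogisticRegression
>>> qp.environ['SAMPLE_SIZE'] = 100
>>> train, val = training.split_stratified(train_prop=0.7)   # training: a LabelledCollection (hypothetical)
>>> gs = GridSearchQ(
...     model=PACC(LogisticRegression()),
...     param_grid={'classifier__C': [0.1, 1.0, 10.0]},
...     protocol=APP(val),
...     error='mae',
...     refit=False,
...     verbose=True
... ).fit(train)
>>> estim_prev = gs.quantify(X_test)   # uses gs.best_model_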
+ + def _error_handler(self, func, params): + """ + Endorses one job with two returned values: the status, and the time of execution + + :param func: the function to be called + :param params: parameters of the function + :return: `tuple(out, status, time)` where `out` is the function output, + `status` is an enum value from `Status`, and `time` is the time it + took to complete the call + """ + + output = None + + def _handle(status, exception): + if self.raise_errors: + raise exception + else: + return ConfigStatus(params, status) + + try: + with timeout(self.timeout): + tinit = time() + output = func(params) + status = ConfigStatus(params, Status.SUCCESS) + + except TimeoutError as e: + status = _handle(Status.TIMEOUT, e) + + except ValueError as e: + status = _handle(Status.INVALID, e) + + except Exception as e: + status = _handle(Status.ERROR, e) + + took = time() - tinit + return output, status, took
+ + +
[docs]def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfolds=3, random_state=0): + """ + Akin to `scikit-learn's cross_val_predict <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html>`_ + but for quantification. + + :param quantifier: a quantifier issuing class prevalence values + :param data: a labelled collection + :param nfolds: number of folds for k-fold cross validation generation + :param random_state: random seed for reproducibility + :return: a vector of class prevalence values + """ + + total_prev = np.zeros(shape=data.n_classes) + + for train, test in data.kFCV(nfolds=nfolds, random_state=random_state): + quantifier.fit(train) + fold_prev = quantifier.quantify(test.X) + rel_size = 1. * len(test) / len(data) + total_prev += fold_prev*rel_size + + return total_prev
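For example (with `data` a hypothetical LabelledCollection):

>>> from sklearn.linear_model import LogisticRegression
>>> from quapy.method.aggregative import PACC
>>> prev_estimate = cross_val_predict(PACC(LogisticRegression()), data, nfolds=5)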
+ + +
[docs]def expand_grid(param_grid: dict): + """ + Expands a param_grid dictionary as a list of configurations. + Example: + + >>> combinations = expand_grid({'A': [1, 10, 100], 'B': [True, False]}) + >>> print(combinations) + >>> [{'A': 1, 'B': True}, {'A': 1, 'B': False}, {'A': 10, 'B': True}, {'A': 10, 'B': False}, {'A': 100, 'B': True}, {'A': 100, 'B': False}] + + :param param_grid: dictionary with keys representing hyper-parameter names, and values representing the range + to explore for that hyper-parameter + :return: a list of configurations, i.e., combinations of hyper-parameter assignments in the grid. + """ + params_keys = list(param_grid.keys()) + params_values = list(param_grid.values()) + configs = [{k: combs[i] for i, k in enumerate(params_keys)} for combs in itertools.product(*params_values)] + return configs
+ + +
[docs]def group_params(param_grid: dict): + """ + Partitions a param_grid dictionary into two lists of configurations, one for the classifier-specific + hyper-parameters, and another for the quantifier-specific hyper-parameters. + + :param param_grid: dictionary with keys representing hyper-parameter names, and values representing the range + to explore for that hyper-parameter + :return: two expanded grids of configurations, one for the classifier, another for the quantifier + """ + classifier_params, quantifier_params = {}, {} + for key, values in param_grid.items(): + if key.startswith('classifier__') or key == 'val_split': + classifier_params[key] = values + else: + quantifier_params[key] = values + + classifier_configs = expand_grid(classifier_params) + quantifier_configs = expand_grid(quantifier_params) + + return classifier_configs, quantifier_configs
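For instance, following the `classifier__` naming convention checked above:

>>> cls_configs, q_configs = group_params({'classifier__C': [1, 10], 'nbins': [4, 8]})
>>> cls_configs
[{'classifier__C': 1}, {'classifier__C': 10}]
>>> q_configs
[{'nbins': 4}, {'nbins': 8}]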
+ +
+ +
+
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/docs/build/html/_modules/quapy/plot.html b/docs/build/html/_modules/quapy/plot.html new file mode 100644 index 0000000..79179a1 --- /dev/null +++ b/docs/build/html/_modules/quapy/plot.html @@ -0,0 +1,687 @@ + + + + + + quapy.plot — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for quapy.plot

+from collections import defaultdict
+import matplotlib.pyplot as plt
+from matplotlib.cm import get_cmap
+import numpy as np
+from matplotlib import cm
+from scipy.stats import ttest_ind_from_stats
+from matplotlib.ticker import ScalarFormatter
+import math
+
+import quapy as qp
+
+plt.rcParams['figure.figsize'] = [10, 6]
+plt.rcParams['figure.dpi'] = 200
+plt.rcParams['font.size'] = 18
+
+
+
+[docs] +def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True, + train_prev=None, savepath=None, method_order=None): + """ + The diagonal plot displays the predicted prevalence values (along the y-axis) as a function of the true prevalence + values (along the x-axis). The optimal quantifier is described by the diagonal (0,0)-(1,1) of the plot (hence the + name). It is convenient for binary quantification problems, though it can be used for multiclass problems by + indicating which class is to be taken as the positive class. (For multiclass quantification problems, other plots + like the :meth:`error_by_drift` might be preferable though). + + :param method_names: array-like with the method names for each experiment + :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for + each experiment + :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components) + for each experiment + :param pos_class: index of the positive class + :param title: the title to be displayed in the plot + :param show_std: whether or not to show standard deviations (represented by color bands). This might be inconvenient + for cases in which many methods are compared, or when the standard deviations are high -- default True) + :param legend: whether or not to display the leyend (default True) + :param train_prev: if indicated (default is None), the training prevalence (for the positive class) is hightlighted + in the plot. This is convenient when all the experiments have been conducted in the same dataset. + :param savepath: path where to save the plot. If not indicated (as default), the plot is shown. + :param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e., + listed in the legend and associated with matplotlib colors). + """ + fig, ax = plt.subplots() + ax.set_aspect('equal') + ax.grid() + ax.plot([0, 1], [0, 1], '--k', label='ideal', zorder=1) + + method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs) + + order = list(zip(method_names, true_prevs, estim_prevs)) + if method_order is not None: + table = {method_name:[true_prev, estim_prev] for method_name, true_prev, estim_prev in order} + order = [(method_name, *table[method_name]) for method_name in method_order] + + NUM_COLORS = len(method_names) + if NUM_COLORS>10: + cm = plt.get_cmap('tab20') + ax.set_prop_cycle(color=[cm(1. 
* i / NUM_COLORS) for i in range(NUM_COLORS)]) + for method, true_prev, estim_prev in order: + true_prev = true_prev[:,pos_class] + estim_prev = estim_prev[:,pos_class] + + x_ticks = np.unique(true_prev) + x_ticks.sort() + y_ave = np.asarray([estim_prev[true_prev == x].mean() for x in x_ticks]) + y_std = np.asarray([estim_prev[true_prev == x].std() for x in x_ticks]) + + ax.errorbar(x_ticks, y_ave, fmt='-', marker='o', label=method, markersize=3, zorder=2) + if show_std: + ax.fill_between(x_ticks, y_ave - y_std, y_ave + y_std, alpha=0.25) + + if train_prev is not None: + train_prev = train_prev[pos_class] + ax.scatter(train_prev, train_prev, c='c', label='tr-prev', linewidth=2, edgecolor='k', s=100, zorder=3) + + ax.set(xlabel='true prevalence', ylabel='estimated prevalence', title=title) + ax.set_ylim(0, 1) + ax.set_xlim(0, 1) + + if legend: + ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) + # box = ax.get_position() + # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + # ax.legend(loc='lower center', + # bbox_to_anchor=(1, -0.5), + # ncol=(len(method_names)+1)//2) + + _save_or_show(savepath)
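A usage sketch of the diagonal plot. The prevalence arrays would typically be collected over an evaluation protocol; all variable names here (`method_names`, `true_prevs`, `estim_prevs`, `training`) are hypothetical:

>>> import quapy as qp
>>> qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs,
...                         train_prev=training.prevalence(), savepath='diagonal.png')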
+ + + +
+[docs] +def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None): + """ + Box-plots displaying the global bias (i.e., signed error computed as the estimated value minus the true value) + for each quantification method with respect to a given positive class. + + :param method_names: array-like with the method names for each experiment + :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for + each experiment + :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components) + for each experiment + :param pos_class: index of the positive class + :param title: the title to be displayed in the plot + :param savepath: path where to save the plot. If not indicated (as default), the plot is shown. + """ + + method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs) + + fig, ax = plt.subplots() + ax.grid() + + data, labels = [], [] + for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs): + true_prev = true_prev[:,pos_class] + estim_prev = estim_prev[:,pos_class] + data.append(estim_prev-true_prev) + labels.append(method) + + ax.boxplot(data, labels=labels, patch_artist=False, showmeans=True) + plt.xticks(rotation=45) + ax.set(ylabel='error bias', title=title) + + _save_or_show(savepath)
+ + + +
+[docs] +def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10, + vertical_xticks=False, legend=True, savepath=None): + """ + Box-plots displaying the local bias (i.e., signed error computed as the estimated value minus the true value) + for different bins of (true) prevalence of the positive classs, for each quantification method. + + :param method_names: array-like with the method names for each experiment + :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for + each experiment + :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components) + for each experiment + :param pos_class: index of the positive class + :param title: the title to be displayed in the plot + :param nbins: number of bins + :param colormap: the matplotlib colormap to use (default cm.tab10) + :param vertical_xticks: whether or not to add secondary grid (default is False) + :param legend: whether or not to display the legend (default is True) + :param savepath: path where to save the plot. If not indicated (as default), the plot is shown. + """ + from pylab import boxplot, plot, setp + + fig, ax = plt.subplots() + ax.grid() + + method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs) + + bins = np.linspace(0, 1, nbins+1) + binwidth = 1/nbins + data = {} + for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs): + true_prev = true_prev[:,pos_class] + estim_prev = estim_prev[:,pos_class] + + data[method] = [] + inds = np.digitize(true_prev, bins, right=True) + for ind in range(len(bins)): + selected = inds==ind + data[method].append(estim_prev[selected] - true_prev[selected]) + + nmethods = len(method_names) + boxwidth = binwidth/(nmethods+4) + for i,bin in enumerate(bins[:-1]): + boxdata = [data[method][i] for method in method_names] + positions = [bin+(i*boxwidth)+2*boxwidth for i,_ in enumerate(method_names)] + box = boxplot(boxdata, showmeans=False, positions=positions, widths = boxwidth, sym='+', patch_artist=True) + for boxid in range(len(method_names)): + c = colormap.colors[boxid%len(colormap.colors)] + setp(box['fliers'][boxid], color=c, marker='+', markersize=3., markeredgecolor=c) + setp(box['boxes'][boxid], color=c) + setp(box['medians'][boxid], color='k') + + major_xticks_positions, minor_xticks_positions = [], [] + major_xticks_labels, minor_xticks_labels = [], [] + for i,b in enumerate(bins[:-1]): + major_xticks_positions.append(b) + minor_xticks_positions.append(b + binwidth / 2) + major_xticks_labels.append('') + minor_xticks_labels.append(f'[{bins[i]:.2f}-{bins[i + 1]:.2f})') + ax.set_xticks(major_xticks_positions) + ax.set_xticks(minor_xticks_positions, minor=True) + ax.set_xticklabels(major_xticks_labels) + ax.set_xticklabels(minor_xticks_labels, minor=True, rotation='vertical' if vertical_xticks else 'horizontal') + + if vertical_xticks: + # Pad margins so that markers don't get clipped by the axes + plt.margins(0.2) + # Tweak spacing to prevent clipping of tick-labels + plt.subplots_adjust(bottom=0.15) + + if legend: + # adds the legend to the list hs, initialized with the "ideal" quantifier (one that has 0 bias across all bins. i.e. + # a line from (0,0) to (1,0). 
The other elements are simply labelled dot-plots that are to be removed (setting + # set_visible to False for all but the first element) after the legend has been placed + hs=[ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]] + for colorid in range(len(method_names)): + color=colormap.colors[colorid % len(colormap.colors)] + h, = plot([0, 0], '-s', markerfacecolor=color, color='k',mec=color, linewidth=1.) + hs.append(h) + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + ax.legend(hs, ['ideal']+method_names, loc='center left', bbox_to_anchor=(1, 0.5)) + [h.set_visible(False) for h in hs[1:]] + + # x-axis and y-axis labels and limits + ax.set(xlabel='prevalence', ylabel='error bias', title=title) + ax.set_xlim(0, 1) + + _save_or_show(savepath)
+ + + +
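The two bias plots above share the same input layout: one entry per experiment in `method_names`, and, for each experiment, an `(n_samples, n_classes)` matrix of true and of estimated prevalence vectors. Below is a minimal, self-contained sketch using synthetic results for two made-up methods ('MethodA', 'MethodB'); in practice these arrays would come from evaluating real quantifiers, e.g., via `qp.evaluation.prediction` over an evaluation protocol.

```python
import numpy as np
import quapy as qp

# synthetic results for two hypothetical methods; in a real experiment these arrays
# would come from evaluating quantifiers over many test samples
rng = np.random.default_rng(0)
n_samples = 500

def fake_results(noise):
    p = rng.uniform(size=n_samples)                              # true prevalence of the positive class
    p_hat = np.clip(p + rng.normal(0, noise, n_samples), 0, 1)   # noisy estimate
    return np.vstack([1 - p, p]).T, np.vstack([1 - p_hat, p_hat]).T

true_a, estim_a = fake_results(noise=0.03)
true_b, estim_b = fake_results(noise=0.10)

method_names = ['MethodA', 'MethodB']
true_prevs = [true_a, true_b]
estim_prevs = [estim_a, estim_b]

# one box per method, showing the global signed error (estimated minus true)
qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1)

# the same signed error, broken down by bins of true prevalence of the positive class
qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, nbins=5)
```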
+[docs] +def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, + n_bins=20, error_name='ae', show_std=False, + show_density=True, + show_legend=True, + logscale=False, + title=f'Quantification error as a function of distribution shift', + vlines=None, + method_order=None, + savepath=None): + """ + Plots the error (along the y-axis, as measured in terms of `error_name`) as a function of the train-test shift + (along the x-axis, as measured in terms of :meth:`quapy.error.ae`). This plot is especially useful for multiclass + problems, in which "diagonal plots" may be cumbersome, and for gaining insight into how methods + fare in different regions of the prior probability shift spectrum (e.g., in the low-shift regime vs. in the + high-shift regime). + + :param method_names: array-like with the method names for each experiment + :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for + each experiment + :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components) + for each experiment + :param tr_prevs: training prevalence of each experiment + :param n_bins: number of bins in which the x-axis is to be divided (default is 20) + :param error_name: a string representing the name of an error function (as defined in `quapy.error`, default is "ae") + :param show_std: whether or not to show standard deviations as color bands (default is False) + :param show_density: whether or not to display the distribution of experiments for each bin (default is True) + :param show_legend: whether or not to display the legend of the chart (default is True) + :param logscale: whether or not to log-scale the y-error measure (default is False) + :param title: title of the plot (default is "Quantification error as a function of distribution shift") + :param vlines: array-like list of values (default is None). If indicated, highlights some regions of the space + using vertical dotted lines. + :param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e., + listed in the legend and associated with matplotlib colors). + :param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
+ """ + + fig, ax = plt.subplots() + ax.grid() + + x_error = qp.error.ae + y_error = getattr(qp.error, error_name) + + # get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same + # order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to + # x_error function) and 'y' is the estim-test shift (computed as according to y_error) + data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order) + + if method_order is None: + method_order = method_names + + _set_colors(ax, n_methods=len(method_order)) + + bins = np.linspace(0, 1, n_bins+1) + binwidth = 1 / n_bins + min_x, max_x, min_y, max_y = None, None, None, None + npoints = np.zeros(len(bins), dtype=float) + for method in method_order: + tr_test_drifts = data[method]['x'] + method_drifts = data[method]['y'] + if logscale: + ax.set_yscale("log") + ax.yaxis.set_major_formatter(ScalarFormatter()) + ax.yaxis.get_major_formatter().set_scientific(False) + ax.minorticks_off() + + inds = np.digitize(tr_test_drifts, bins, right=True) + + xs, ys, ystds = [], [], [] + for p,ind in enumerate(range(len(bins))): + selected = inds==ind + if selected.sum() > 0: + xs.append(ind*binwidth-binwidth/2) + ys.append(np.mean(method_drifts[selected])) + ystds.append(np.std(method_drifts[selected])) + npoints[p] += len(method_drifts[selected]) + + xs = np.asarray(xs) + ys = np.asarray(ys) + ystds = np.asarray(ystds) + + min_x_method, max_x_method, min_y_method, max_y_method = xs.min(), xs.max(), ys.min(), ys.max() + min_x = min_x_method if min_x is None or min_x_method < min_x else min_x + max_x = max_x_method if max_x is None or max_x_method > max_x else max_x + max_y = max_y_method if max_y is None or max_y_method > max_y else max_y + min_y = min_y_method if min_y is None or min_y_method < min_y else min_y + max_y = max_y_method if max_y is None or max_y_method > max_y else max_y + + ax.errorbar(xs, ys, fmt='-', marker='o', color='w', markersize=8, linewidth=4, zorder=1) + ax.errorbar(xs, ys, fmt='-', marker='o', label=method, markersize=6, linewidth=2, zorder=2) + + if show_std: + ax.fill_between(xs, ys-ystds, ys+ystds, alpha=0.25) + + if show_density: + ax2 = ax.twinx() + densities = npoints/np.sum(npoints) + ax2.bar([ind * binwidth-binwidth/2 for ind in range(len(bins))], + densities, alpha=0.15, color='g', width=binwidth, label='density') + ax2.set_ylim(0,max(densities)) + ax2.spines['right'].set_color('g') + ax2.tick_params(axis='y', colors='g') + + ax.set(xlabel=f'Distribution shift between training set and test sample', + ylabel=f'{error_name.upper()} (true distribution, predicted distribution)', + title=title) + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + if vlines: + for vline in vlines: + ax.axvline(vline, 0, 1, linestyle='--', color='k') + + ax.set_xlim(min_x, max_x) + if logscale: + #nice scale for the logaritmic axis + ax.set_ylim(0,10 ** math.ceil(math.log10(max_y))) + + + if show_legend: + fig.legend(loc='lower center', + bbox_to_anchor=(1, 0.5), + ncol=(len(method_names)+1)//2) + + _save_or_show(savepath)
+ + + +
+[docs] +def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, + n_bins=20, binning='isomerous', + x_error='ae', y_error='ae', ttest_alpha=0.005, tail_density_threshold=0.005, + method_order=None, + savepath=None): + """ + Displays (only) the top performing methods for different regions of the train-test shift in the form of a broken + bar chart, in which each method has bars only for those regions in which either one of the following conditions + holds: (i) it is the best method (on average) for the bin, or (ii) it is not statistically significantly different + (on average) from the best one, according to a two-sided t-test on independent samples at significance level `ttest_alpha`. + The binning can be made "isometric" (same size), or "isomerous" (same number of experiments -- default). A second + plot is displayed on top, which shows the distribution of experiments for each bin (when binning="isometric") or + the percentile points of the distribution (when binning="isomerous"). + + :param method_names: array-like with the method names for each experiment + :param true_prevs: array-like with the true prevalence values (each being a ndarray with n_classes components) for + each experiment + :param estim_prevs: array-like with the estimated prevalence values (each being a ndarray with n_classes components) + for each experiment + :param tr_prevs: training prevalence of each experiment + :param n_bins: number of bins in which the x-axis is to be divided (default is 20) + :param binning: type of binning, either "isomerous" (default) or "isometric" + :param x_error: a string representing the name of an error function (as defined in `quapy.error`) to be used for + measuring the amount of train-test shift (default is "ae") + :param y_error: a string representing the name of an error function (as defined in `quapy.error`) to be used for + measuring the amount of error in the prevalence estimations (default is "ae") + :param ttest_alpha: the significance threshold above which a p-value (two-sided t-test on independent samples) is + to be considered as an indicator that the two means are not statistically significantly different. Default is + 0.005, meaning that a `p-value > 0.005` indicates the two methods involved are to be considered similar + :param tail_density_threshold: sets a threshold on the density of experiments (over the total number of experiments) + below which a bin in the tail (i.e., the right-most ones) will be discarded. This is in order to avoid + displaying bins that correspond to a few train-test outliers. + :param method_order: if indicated (default is None), imposes the order in which the methods are processed (i.e., + listed in the legend and associated with matplotlib colors). + :param savepath: path where to save the plot. If not indicated (as default), the plot is shown.
+ :return: + """ + assert binning in ['isomerous', 'isometric'], 'unknown binning type; valid types are "isomerous" and "isometric"' + + x_error = getattr(qp.error, x_error) + y_error = getattr(qp.error, y_error) + + # get all data as a dictionary {'m':{'x':ndarray, 'y':ndarray}} where 'm' is a method name (in the same + # order as in method_order (if specified), and where 'x' are the train-test shifts (computed as according to + # x_error function) and 'y' is the estim-test shift (computed as according to y_error) + data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order) + + if method_order is None: + method_order = method_names + + if binning == 'isomerous': + # take bins containing the same amount of examples + tr_test_drifts = np.concatenate([data[m]['x'] for m in method_order]) + bins = np.quantile(tr_test_drifts, q=np.linspace(0, 1, n_bins+1)).flatten() + else: + # take equidistant bins + bins = np.linspace(0, 1, n_bins+1) + bins[0] = -0.001 + bins[-1] += 0.001 + + # we use this to keep track of how many datapoits contribute to each bin + inds_histogram_global = np.zeros(n_bins, dtype=float) + n_methods = len(method_order) + buckets = np.zeros(shape=(n_methods, n_bins, 3)) + for i, method in enumerate(method_order): + tr_test_drifts = data[method]['x'] + method_drifts = data[method]['y'] + + inds = np.digitize(tr_test_drifts, bins, right=False) + inds_histogram_global += np.histogram(tr_test_drifts, density=False, bins=bins)[0] + + for j in range(len(bins)): + selected = inds == j + if selected.sum() > 0: + buckets[i, j-1, 0] = np.mean(method_drifts[selected]) + buckets[i, j-1, 1] = np.std(method_drifts[selected]) + buckets[i, j-1, 2] = selected.sum() + + # cancel last buckets with low density + histogram = inds_histogram_global / inds_histogram_global.sum() + for tail in reversed(range(len(histogram))): + if histogram[tail] < tail_density_threshold: + buckets[:,tail,2] = 0 + else: + break + + salient_methods = set() + best_methods = [] + for bucket in range(buckets.shape[1]): + nc = buckets[:, bucket, 2].sum() + if nc == 0: + best_methods.append([]) + continue + + order = np.argsort(buckets[:, bucket, 0]) + rank1 = order[0] + best_bucket_methods = [method_order[rank1]] + best_mean, best_std, best_nc = buckets[rank1, bucket, :] + for method_index in order[1:]: + method_mean, method_std, method_nc = buckets[method_index, bucket, :] + _, pval = ttest_ind_from_stats(best_mean, best_std, best_nc, method_mean, method_std, method_nc) + if pval > ttest_alpha: + best_bucket_methods.append(method_order[method_index]) + best_methods.append(best_bucket_methods) + salient_methods.update(best_bucket_methods) + print(best_bucket_methods) + + if binning=='isomerous': + fig, axes = plt.subplots(2, 1, gridspec_kw={'height_ratios': [0.2, 1]}, figsize=(20, len(salient_methods))) + else: + fig, axes = plt.subplots(2, 1, gridspec_kw={'height_ratios': [1, 1]}, figsize=(20, len(salient_methods))) + + ax = axes[1] + high_from = 0 + yticks, yticks_method_names = [], [] + color = get_cmap('Accent').colors + vlines = [] + bar_high = 1 + for method in [m for m in method_order if m in salient_methods]: + broken_paths = [] + path_start, path_end = None, None + for i, best_bucket_methods in enumerate(best_methods): + if method in best_bucket_methods: + if path_start is None: + path_start = bins[i] + path_end = bins[i+1]-path_start + else: + path_end += bins[i+1]-bins[i] + else: + if path_start is not None: + broken_paths.append(tuple((path_start, path_end))) + 
path_start, path_end = None, None + if path_start is not None: + broken_paths.append(tuple((path_start, path_end))) + + ax.broken_barh(broken_paths, (high_from, bar_high), facecolors=color[len(yticks_method_names)]) + yticks.append(high_from+bar_high/2) + high_from += bar_high + yticks_method_names.append(method) + for path_start, path_end in broken_paths: + vlines.extend([path_start, path_start+path_end]) + + vlines = np.unique(vlines) + vlines = sorted(vlines) + for v in vlines[1:-1]: + ax.axvline(x=v, color='k', linestyle='--') + + ax.set_ylim(0, high_from) + ax.set_xlim(vlines[0], vlines[-1]) + ax.set_xlabel('Distribution shift between training set and sample') + + ax.set_yticks(yticks) + ax.set_yticklabels(yticks_method_names) + + # upper plot (explaining distribution) + ax = axes[0] + if binning == 'isometric': + # show the density for each region + bins[0]=0 + y_pos = [b+(bins[i+1]-b)/2 for i,b in enumerate(bins[:-1]) if histogram[i]>0] + bar_width = [bins[i+1]-bins[i] for i in range(len(bins[:-1])) if histogram[i]>0] + ax.bar(y_pos, [n for n in histogram if n>0], bar_width, align='center', alpha=0.5, color='silver') + ax.set_ylabel('shift\ndistribution', rotation=0, ha='right', va='center') + ax.set_xlim(vlines[0], vlines[-1]) + ax.get_xaxis().set_visible(False) + plt.subplots_adjust(wspace=0, hspace=0.1) + else: + # show the percentiles of the distribution + cumsum = np.cumsum(histogram) + for i in range(len(bins[:-1])): + start, width = bins[i], bins[i+1]-bins[i] + ax.broken_barh([tuple((start, width))], (0, 1), facecolors='whitesmoke' if i%2==0 else 'silver') + if i < len(bins)-2: + ax.text(bins[i+1], 0.5, '$P_{'+f'{int(np.round(cumsum[i]*100))}'+'}$', ha='center') + ax.set_ylim(0, 1) + ax.set_xlim(vlines[0], vlines[-1]) + ax.get_yaxis().set_visible(False) + ax.get_xaxis().set_visible(False) + plt.subplots_adjust(wspace=0, hspace=0) + + _save_or_show(savepath)
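The two drift-oriented plots take an additional `tr_prevs` argument with the training prevalence vector of each experiment, from which the train-test shift (x-axis) is computed. A minimal sketch along the same lines as the previous one (all data synthetic, method names made up):

```python
import numpy as np
import quapy as qp

rng = np.random.default_rng(0)
n_samples = 500

method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
for name, noise in [('MethodA', 0.03), ('MethodB', 0.10)]:
    p = rng.uniform(size=n_samples)                              # true prevalence of the positive class
    p_hat = np.clip(p + rng.normal(0, noise, n_samples), 0, 1)   # noisy estimate
    method_names.append(name)
    true_prevs.append(np.vstack([1 - p, p]).T)
    estim_prevs.append(np.vstack([1 - p_hat, p_hat]).T)
    tr_prevs.append(np.asarray([0.7, 0.3]))                      # prevalence of the (hypothetical) training set

# mean error as a function of the amount of prior probability shift
qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=10, error_name='ae')

# broken-bar chart highlighting, for each region of shift, the best-performing method(s)
qp.plot.brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
                                     n_bins=10, binning='isomerous')
```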
+ + + +def _merge(method_names, true_prevs, estim_prevs): + ndims = true_prevs[0].shape[1] + data = defaultdict(lambda: {'true': np.empty(shape=(0, ndims)), 'estim': np.empty(shape=(0, ndims))}) + method_order=[] + for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs): + data[method]['true'] = np.concatenate([data[method]['true'], true_prev]) + data[method]['estim'] = np.concatenate([data[method]['estim'], estim_prev]) + if method not in method_order: + method_order.append(method) + true_prevs_ = [data[m]['true'] for m in method_order] + estim_prevs_ = [data[m]['estim'] for m in method_order] + return method_order, true_prevs_, estim_prevs_ + + +def _set_colors(ax, n_methods): + NUM_COLORS = n_methods + cm = plt.get_cmap('tab20') + ax.set_prop_cycle(color=[cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)]) + + +def _save_or_show(savepath): + # if savepath is specified, then saves the plot in that path; otherwise the plot is shown + if savepath is not None: + qp.util.create_parent_dir(savepath) + # plt.tight_layout() + plt.savefig(savepath, bbox_inches='tight') + else: + plt.show() + + +def _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order): + data = defaultdict(lambda: {'x': np.empty(shape=(0)), 'y': np.empty(shape=(0))}) + + if method_order is None: + method_order = [] + + for method, test_prevs_i, estim_prevs_i, tr_prev_i in zip(method_names, true_prevs, estim_prevs, tr_prevs): + tr_prev_i = np.repeat(tr_prev_i.reshape(1, -1), repeats=test_prevs_i.shape[0], axis=0) + + tr_test_drifts = x_error(test_prevs_i, tr_prev_i) + data[method]['x'] = np.concatenate([data[method]['x'], tr_test_drifts]) + + method_drifts = y_error(test_prevs_i, estim_prevs_i) + data[method]['y'] = np.concatenate([data[method]['y'], method_drifts]) + + if method not in method_order: + method_order.append(method) + + return data +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/protocol.html b/docs/build/html/_modules/quapy/protocol.html
new file mode 100644
index 0000000..7d96338
--- /dev/null
+++ b/docs/build/html/_modules/quapy/protocol.html
@@ -0,0 +1,606 @@
[Sphinx page header and navigation markup omitted]

Source code for quapy.protocol

+from copy import deepcopy
+import quapy as qp
+import numpy as np
+import itertools
+from contextlib import ExitStack
+from abc import ABCMeta, abstractmethod
+from quapy.data import LabelledCollection
+import quapy.functional as F
+from os.path import exists
+from glob import glob
+
+
+
[docs]class AbstractProtocol(metaclass=ABCMeta): + """ + Abstract parent class for sample generation protocols. + """ + + @abstractmethod + def __call__(self): + """ + Implements the protocol. Yields one sample at a time along with its prevalence + + :return: yields a tuple `(sample, prev) at a time, where `sample` is a set of instances + and in which `prev` is an `nd.array` with the class prevalence values + """ + ... + +
[docs] def total(self): + """ + Indicates the total number of samples that the protocol generates. + + :return: The number of samples to generate if known, or `None` otherwise. + """ + return None
+ + +
[docs]class IterateProtocol(AbstractProtocol): + """ + A very simple protocol which simply iterates over a list of previously generated samples + + :param samples: a list of :class:`quapy.data.base.LabelledCollection` + """ + def __init__(self, samples: [LabelledCollection]): + self.samples = samples + + def __call__(self): + """ + Yields one sample from the initial list at a time + + :return: yields a tuple `(sample, prev) at a time, where `sample` is a set of instances + and in which `prev` is an `nd.array` with the class prevalence values + """ + for sample in self.samples: + yield sample.Xp + +
[docs] def total(self): + """ + Returns the number of samples in this protocol + + :return: int + """ + return len(self.samples)
+ + +
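A quick sketch showing how `IterateProtocol` wraps a list of pre-generated samples; the toy `LabelledCollection` objects below contain random data and serve only to illustrate the calling convention.

```python
import numpy as np
from quapy.data import LabelledCollection
from quapy.protocol import IterateProtocol

rng = np.random.default_rng(0)
# three pre-generated samples of a toy binary problem (random features and labels)
samples = [LabelledCollection(rng.random((100, 5)), rng.integers(0, 2, 100)) for _ in range(3)]

protocol = IterateProtocol(samples)
print('total samples:', protocol.total())   # 3
for instances, prev in protocol():          # yields (instances, prevalence) pairs
    print(instances.shape, prev)
```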
[docs]class AbstractStochasticSeededProtocol(AbstractProtocol): + """ + An `AbstractStochasticSeededProtocol` is a protocol that generates, via any random procedure (e.g., + via random sampling), sequences of :class:`quapy.data.base.LabelledCollection` samples. + The protocol abstraction enforces + the object to be instantiated using a seed, so that the sequence can be fully replicated. + In order to make this functionality possible, the classes extending this abstraction need to + implement only two functions, :meth:`samples_parameters` which generates all the parameters + needed for extracting the samples, and :meth:`sample` that, given some parameters as input, + deterministically generates a sample. + + :param random_state: the seed for allowing to replicate any sequence of samples. Default is 0, meaning that + the sequence will be consistent every time the protocol is called. + """ + + _random_state = -1 # means "not set" + + def __init__(self, random_state=0): + self.random_state = random_state + + @property + def random_state(self): + return self._random_state + + @random_state.setter + def random_state(self, random_state): + self._random_state = random_state + +
[docs] @abstractmethod + def samples_parameters(self): + """ + This function has to return all the necessary parameters to replicate the samples + + :return: a list of parameters, each of which serves to deterministically generate a sample + """ + ...
+ +
[docs] @abstractmethod + def sample(self, params): + """ + Extract one sample determined by the given parameters + + :param params: all the necessary parameters to generate a sample + :return: one sample (the same sample has to be generated for the same parameters) + """ + ...
+ + def __call__(self): + """ + Yields one sample at a time. The type of object returned depends on the `collator` function. The + default behaviour returns tuples of the form `(sample, prevalence)`. + + :return: a tuple `(sample, prevalence)` if return_type='sample_prev', or an instance of + :class:`qp.data.LabelledCollection` if return_type='labelled_collection' + """ + with ExitStack() as stack: + if self.random_state == -1: + raise ValueError('The random seed has never been initialized. ' + 'Set it to None not to impose replicability.') + if self.random_state is not None: + stack.enter_context(qp.util.temp_seed(self.random_state)) + for params in self.samples_parameters(): + yield self.collator(self.sample(params)) + +
[docs] def collator(self, sample, *args): + """ + The collator prepares the sample to accommodate the desired output format before returning the output. + This collator simply returns the sample as it is. Classes inheriting from this abstract class can + implement their custom collators. + + :param sample: the sample to be returned + :param args: additional arguments + :return: the sample adhering to a desired output format (in this case, the sample is returned as it is) + """ + return sample
+ + +
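To illustrate the contract described above (all randomness confined to `samples_parameters`, and `sample` deterministic given its parameters), here is a toy subclass; `RandomSubsetProtocol` is an invented name for this sketch and is not part of QuaPy.

```python
import numpy as np
from quapy.data import LabelledCollection
from quapy.protocol import AbstractStochasticSeededProtocol

class RandomSubsetProtocol(AbstractStochasticSeededProtocol):
    """Toy protocol drawing `repeats` random subsets of fixed size (no prevalence control)."""

    def __init__(self, data: LabelledCollection, sample_size, repeats=5, random_state=0):
        super().__init__(random_state)
        self.data = data
        self.sample_size = sample_size
        self.repeats = repeats

    def samples_parameters(self):
        # all randomness happens here, under the seed managed by the parent class
        return [np.random.choice(len(self.data), self.sample_size, replace=False)
                for _ in range(self.repeats)]

    def sample(self, index):
        # deterministic given the parameters (the indexes)
        return self.data.sampling_from_index(index)

    def total(self):
        return self.repeats

rng = np.random.default_rng(0)
data = LabelledCollection(rng.random((1000, 5)), rng.integers(0, 2, 1000))
prot = RandomSubsetProtocol(data, sample_size=50, repeats=3)
for sample in prot():                  # the default collator returns each sample as-is
    print(len(sample), sample.prevalence())
```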
[docs]class OnLabelledCollectionProtocol: + """ + Protocols that generate samples from a :class:`qp.data.LabelledCollection` object. + """ + + RETURN_TYPES = ['sample_prev', 'labelled_collection', 'index'] + +
[docs] def get_labelled_collection(self): + """ + Returns the labelled collection on which this protocol acts. + + :return: an object of type :class:`qp.data.LabelledCollection` + """ + return self.data
+ +
[docs] def on_preclassified_instances(self, pre_classifications, in_place=False): + """ + Returns a copy of this protocol that acts on a modified version of the original + :class:`qp.data.LabelledCollection` in which the original instances have been replaced + with the outputs of a classifier for each instance. (This is convenient for speeding-up + the evaluation procedures for many samples, by pre-classifying the instances in advance.) + + :param pre_classifications: the predictions issued by a classifier, typically an array-like + with shape `(n_instances,)` when the classifier is a hard one, or with shape + `(n_instances, n_classes)` when the classifier is a probabilistic one. + :param in_place: whether or not to apply the modification in-place or in a new copy (default). + :return: a copy of this protocol + """ + assert len(pre_classifications) == len(self.data), \ + f'error: the pre-classified data has different shape ' \ + f'(expected {len(self.data)}, found {len(pre_classifications)})' + if in_place: + self.data.instances = pre_classifications + return self + else: + new = deepcopy(self) + return new.on_preclassified_instances(pre_classifications, in_place=True)
+ +
[docs] @classmethod + def get_collator(cls, return_type='sample_prev'): + """ + Returns a collator function, i.e., a function that prepares the yielded data + + :param return_type: either 'sample_prev' (default) if the collator is requested to yield tuples of + `(sample, prevalence)`, or 'labelled_collection' when it is requested to yield instances of + :class:`qp.data.LabelledCollection` + :return: the collator function (a callable function that takes as input an instance of + :class:`qp.data.LabelledCollection`) + """ + assert return_type in cls.RETURN_TYPES, \ + f'unknown return type passed as argument; valid ones are {cls.RETURN_TYPES}' + if return_type=='sample_prev': + return lambda lc:lc.Xp + elif return_type=='labelled_collection': + return lambda lc:lc
+ + +
[docs]class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol): + """ + Implementation of the artificial prevalence protocol (APP). + The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g., + [0, 0.05, 0.1, 0.15, ..., 1], if `n_prevalences=21`), and generating all valid combinations of + prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ..., + [1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid + combination of prevalence values is indicated by `repeats`. + + :param data: a `LabelledCollection` from which the samples will be drawn + :param sample_size: integer, number of instances in each sample; if None (default) then it is taken from + qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised. + :param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the + grid (default is 21) + :param repeats: number of copies for each valid prevalence vector (default is 10) + :param smooth_limits_epsilon: the quantity to add and subtract to the limits 0 and 1 + :param random_state: allows replicating samples across runs (default 0, meaning that the sequence of samples + will be the same every time the protocol is called) + :param sanity_check: int, raises an exception warning the user that the number of examples to be generated exceed + this number; set to None for skipping this check + :param return_type: set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or + to "labelled_collection" to get instead instances of LabelledCollection + """ + + def __init__(self, data: LabelledCollection, sample_size=None, n_prevalences=21, repeats=10, + smooth_limits_epsilon=0, random_state=0, sanity_check=10000, return_type='sample_prev'): + super(APP, self).__init__(random_state) + self.data = data + self.sample_size = qp._get_sample_size(sample_size) + self.n_prevalences = n_prevalences + self.repeats = repeats + self.smooth_limits_epsilon = smooth_limits_epsilon + if not ((isinstance(sanity_check, int) and sanity_check>0) or sanity_check is None): + raise ValueError('param "sanity_check" must either be None or a positive integer') + if isinstance(sanity_check, int): + n = F.num_prevalence_combinations(n_prevpoints=n_prevalences, n_classes=data.n_classes, n_repeats=repeats) + if n > sanity_check: + raise RuntimeError( + f"Abort: the number of samples that will be generated by {self.__class__.__name__} ({n}) " + f"exceeds the maximum number of allowed samples ({sanity_check = }). Set 'sanity_check' to " + f"None, or to a higher number, for bypassing this check.") + + self.collator = OnLabelledCollectionProtocol.get_collator(return_type) + +
[docs] def prevalence_grid(self): + """ + Generates vectors of prevalence values from an exhaustive grid of prevalence values. The + number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example, + `n_prevalences=11` then the prevalence values of the grid are taken from [0, 0.1, 0.2, ..., 0.9, 1]. Only + valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each + valid vector of prevalence values, `repeats` copies are returned. The vectors are returned in implicit form, + meaning that the last dimension (which is constrained to be 1 minus the sum of the rest) is not included + (note that, quite obviously, in this case the vectors do not sum up to 1). Note that this method is + deterministic, i.e., there is no random sampling anywhere. + + :return: a `np.ndarray` of shape `(n, n_classes-1)`, where `n` is the number of valid combinations found + in the grid multiplied by `repeats` + """ + dimensions = self.data.n_classes + s = F.prevalence_linspace(self.n_prevalences, repeats=1, smooth_limits_epsilon=self.smooth_limits_epsilon) + eps = (s[1]-s[0])/2 # handling floating rounding + s = [s] * (dimensions - 1) + prevs = [p for p in itertools.product(*s, repeat=1) if (sum(p) < (1.+eps))] + prevs = np.asarray(prevs).reshape(len(prevs), -1) + if self.repeats > 1: + prevs = np.repeat(prevs, self.repeats, axis=0) + return prevs
+ +
[docs] def samples_parameters(self): + """ + Return all the necessary parameters to replicate the samples as according to the APP protocol. + + :return: a list of indexes that realize the APP sampling + """ + indexes = [] + for prevs in self.prevalence_grid(): + index = self.data.sampling_index(self.sample_size, *prevs) + indexes.append(index) + return indexes
+ +
[docs] def sample(self, index): + """ + Realizes the sample given the index of the instances. + + :param index: indexes of the instances to select + :return: an instance of :class:`qp.data.LabelledCollection` + """ + return self.data.sampling_from_index(index)
+ +
[docs] def total(self): + """ + Returns the number of samples that will be generated + + :return: int + """ + return F.num_prevalence_combinations(self.n_prevalences, self.data.n_classes, self.repeats)
+ + +
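A short usage sketch for `APP` on toy data (features, labels, and sizes are arbitrary); with 3 classes and `n_prevalences=11`, the grid contains 66 valid prevalence combinations.

```python
import numpy as np
from quapy.data import LabelledCollection
from quapy.protocol import APP

rng = np.random.default_rng(0)
data = LabelledCollection(rng.random((1500, 5)), rng.integers(0, 3, 1500))   # toy 3-class problem

prot = APP(data, sample_size=100, n_prevalences=11, repeats=1, random_state=0)
print('samples to generate:', prot.total())      # 66 for 3 classes and 11 prevalence points

for X, prev in prot():                           # default return_type='sample_prev'
    print(X.shape, prev)

# the same protocol can yield LabelledCollection objects instead
prot_lc = APP(data, sample_size=100, n_prevalences=11, repeats=1, random_state=0,
              return_type='labelled_collection')
first_sample = next(iter(prot_lc()))
print(type(first_sample).__name__)               # LabelledCollection
```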
[docs]class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol): + """ + A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing + samples uniformly at random, therefore approximately preserving the natural prevalence of the collection. + + :param data: a `LabelledCollection` from which the samples will be drawn + :param sample_size: integer, the number of instances in each sample; if None (default) then it is taken from + qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised. + :param repeats: the number of samples to generate. Default is 100. + :param random_state: allows replicating samples across runs (default 0, meaning that the sequence of samples + will be the same every time the protocol is called) + :param return_type: set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or + to "labelled_collection" to get instead instances of LabelledCollection + """ + + def __init__(self, data:LabelledCollection, sample_size=None, repeats=100, random_state=0, + return_type='sample_prev'): + super(NPP, self).__init__(random_state) + self.data = data + self.sample_size = qp._get_sample_size(sample_size) + self.repeats = repeats + self.random_state = random_state + self.collator = OnLabelledCollectionProtocol.get_collator(return_type) + +
[docs] def samples_parameters(self): + """ + Return all the necessary parameters to replicate the samples as according to the NPP protocol. + + :return: a list of indexes that realize the NPP sampling + """ + indexes = [] + for _ in range(self.repeats): + index = self.data.uniform_sampling_index(self.sample_size) + indexes.append(index) + return indexes
+ +
[docs] def sample(self, index): + """ + Realizes the sample given the index of the instances. + + :param index: indexes of the instances to select + :return: an instance of :class:`qp.data.LabelledCollection` + """ + return self.data.sampling_from_index(index)
+ +
[docs] def total(self): + """ + Returns the number of samples that will be generated (equals to "repeats") + + :return: int + """ + return self.repeats
+ + +
[docs]class UPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol): + """ + A variant of :class:`APP` that, instead of using a grid of equidistant prevalence values, + relies on the Kraemer algorithm for sampling unit (k-1)-simplex uniformly at random, with + k the number of classes. This protocol covers the entire range of prevalence values in a + statistical sense, i.e., unlike APP there is no guarantee that it is covered precisely + equally for all classes, but it is preferred in cases in which the number of possible + combinations of the grid values of APP makes this endeavour intractable. + + :param data: a `LabelledCollection` from which the samples will be drawn + :param sample_size: integer, the number of instances in each sample; if None (default) then it is taken from + qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised. + :param repeats: the number of samples to generate. Default is 100. + :param random_state: allows replicating samples across runs (default 0, meaning that the sequence of samples + will be the same every time the protocol is called) + :param return_type: set to "sample_prev" (default) to get the pairs of (sample, prevalence) at each iteration, or + to "labelled_collection" to get instead instances of LabelledCollection + """ + + def __init__(self, data: LabelledCollection, sample_size=None, repeats=100, random_state=0, + return_type='sample_prev'): + super(UPP, self).__init__(random_state) + self.data = data + self.sample_size = qp._get_sample_size(sample_size) + self.repeats = repeats + self.random_state = random_state + self.collator = OnLabelledCollectionProtocol.get_collator(return_type) + +
[docs] def samples_parameters(self): + """ + Return all the necessary parameters to replicate the samples as according to the UPP protocol. + + :return: a list of indexes that realize the UPP sampling + """ + indexes = [] + for prevs in F.uniform_simplex_sampling(n_classes=self.data.n_classes, size=self.repeats): + index = self.data.sampling_index(self.sample_size, *prevs) + indexes.append(index) + return indexes
+ +
[docs] def sample(self, index): + """ + Realizes the sample given the index of the instances. + + :param index: indexes of the instances to select + :return: an instance of :class:`qp.data.LabelledCollection` + """ + return self.data.sampling_from_index(index)
+ +
[docs] def total(self): + """ + Returns the number of samples that will be generated (equals to "repeats") + + :return: int + """ + return self.repeats
+ + +
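A sketch of `UPP` on a toy 5-class problem, for which an exhaustive APP grid would already be fairly large; the data is random and only meant to show the calling convention.

```python
import numpy as np
from quapy.data import LabelledCollection
from quapy.protocol import UPP

rng = np.random.default_rng(0)
data = LabelledCollection(rng.random((2000, 5)), rng.integers(0, 5, 2000))   # toy 5-class problem

# 100 prevalence vectors drawn uniformly at random from the unit 4-simplex (Kraemer algorithm)
prot = UPP(data, sample_size=200, repeats=100, random_state=0)
prevs = np.vstack([prev for _, prev in prot()])
print(prevs.shape)                 # (100, 5)
print(prevs.sum(axis=1).round(3))  # each sampled prevalence vector sums to 1
```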
[docs]class DomainMixer(AbstractStochasticSeededProtocol): + """ + Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence. + + :param domainA: one domain, an object of :class:`qp.data.LabelledCollection` + :param domainB: another domain, an object of :class:`qp.data.LabelledCollection` + :param sample_size: integer, the number of instances in each sample; if None (default) then it is taken from + qp.environ["SAMPLE_SIZE"]. If this is not set, a ValueError exception is raised. + :param repeats: int, number of samples to draw for every mixture rate + :param prevalence: the prevalence to preserv along the mixtures. If specified, should be an array containing + one prevalence value (positive float) for each class and summing up to one. If not specified, the prevalence + will be taken from the domain A (default). + :param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will + generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself. + the specific points + :param random_state: allows replicating samples across runs (default 0, meaning that the sequence of samples + will be the same every time the protocol is called) + """ + + def __init__( + self, + domainA: LabelledCollection, + domainB: LabelledCollection, + sample_size, + repeats=1, + prevalence=None, + mixture_points=11, + random_state=0, + return_type='sample_prev'): + super(DomainMixer, self).__init__(random_state) + self.A = domainA + self.B = domainB + self.sample_size = qp._get_sample_size(sample_size) + self.repeats = repeats + if prevalence is None: + self.prevalence = domainA.prevalence() + else: + self.prevalence = np.asarray(prevalence) + assert len(self.prevalence) == domainA.n_classes, \ + f'wrong shape for the vector prevalence (expected {domainA.n_classes})' + assert F.check_prevalence_vector(self.prevalence), \ + f'the prevalence vector is not valid (either it contains values outside [0,1] or does not sum up to 1)' + if isinstance(mixture_points, int): + self.mixture_points = np.linspace(0, 1, mixture_points)[::-1] + else: + self.mixture_points = np.asarray(mixture_points) + assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \ + 'mixture_model datatype not understood (expected int or a sequence of real values in [0,1])' + self.random_state = random_state + self.collator = OnLabelledCollectionProtocol.get_collator(return_type) + +
[docs] def samples_parameters(self): + """ + Return all the necessary parameters to replicate the samples as according to the this protocol. + + :return: a list of zipped indexes (from A and B) that realize the sampling + """ + indexesA, indexesB = [], [] + for propA in self.mixture_points: + for _ in range(self.repeats): + nA = int(np.round(self.sample_size * propA)) + nB = self.sample_size-nA + sampleAidx = self.A.sampling_index(nA, *self.prevalence) + sampleBidx = self.B.sampling_index(nB, *self.prevalence) + indexesA.append(sampleAidx) + indexesB.append(sampleBidx) + return list(zip(indexesA, indexesB))
+ +
[docs] def sample(self, indexes): + """ + Realizes the sample given a pair of indexes of the instances from A and B. + + :param indexes: indexes of the instances to select from A and B + :return: an instance of :class:`qp.data.LabelledCollection` + """ + indexesA, indexesB = indexes + sampleA = self.A.sampling_from_index(indexesA) + sampleB = self.B.sampling_from_index(indexesB) + return sampleA+sampleB
+ +
[docs] def total(self): + """ + Returns the number of samples that will be generated (equals to "repeats * mixture_points") + + :return: int + """ + return self.repeats * len(self.mixture_points)
+ + +# aliases + +ArtificialPrevalenceProtocol = APP +NaturalPrevalenceProtocol = NPP +UniformPrevalenceProtocol = UPP +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_base.html b/docs/build/html/_modules/quapy/tests/test_base.html
new file mode 100644
index 0000000..baf8cfa
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_base.html
@@ -0,0 +1,110 @@
[Sphinx page header and navigation markup omitted]

Source code for quapy.tests.test_base

+import pytest
+
+
+[docs] +def test_import(): + import quapy as qp + assert qp.__version__ is not None
+ +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_datasets.html b/docs/build/html/_modules/quapy/tests/test_datasets.html
new file mode 100644
index 0000000..785e535
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_datasets.html
@@ -0,0 +1,178 @@
[Sphinx page header and navigation markup omitted]

Source code for quapy.tests.test_datasets

+import pytest
+
+from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
+    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_BINARY_DATASETS, LEQUA2022_TASKS, UCI_MULTICLASS_DATASETS,\
+    fetch_reviews, fetch_twitter, fetch_UCIBinaryDataset, fetch_lequa2022, fetch_UCIMulticlassLabelledCollection
+
+
+
+[docs] +@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS) +def test_fetch_reviews(dataset_name): + dataset = fetch_reviews(dataset_name) + print(f'Dataset {dataset_name}') + print('Training set stats') + dataset.training.stats() + print('Test set stats') + dataset.test.stats()
+ + + +
+[docs] +@pytest.mark.parametrize('dataset_name', TWITTER_SENTIMENT_DATASETS_TEST + TWITTER_SENTIMENT_DATASETS_TRAIN) +def test_fetch_twitter(dataset_name): + try: + dataset = fetch_twitter(dataset_name) + except ValueError as ve: + if dataset_name == 'semeval' and ve.args[0].startswith( + 'dataset "semeval" can only be used for model selection.'): + dataset = fetch_twitter(dataset_name, for_model_selection=True) + print(f'Dataset {dataset_name}') + print('Training set stats') + dataset.training.stats() + print('Test set stats') + dataset.test.stats()
+ + + +
+[docs] +@pytest.mark.parametrize('dataset_name', UCI_BINARY_DATASETS) +def test_fetch_UCIDataset(dataset_name): + try: + dataset = fetch_UCIBinaryDataset(dataset_name) + except FileNotFoundError as fnfe: + if dataset_name == 'pageblocks.5' and fnfe.args[0].find( + 'If this is the first time you attempt to load this dataset') > 0: + print('The pageblocks.5 dataset requires some hand processing to be usable, skipping this test.') + return + print(f'Dataset {dataset_name}') + print('Training set stats') + dataset.training.stats() + print('Test set stats') + dataset.test.stats()
+ + + +
+[docs] +@pytest.mark.parametrize('dataset_name', UCI_MULTICLASS_DATASETS) +def test_fetch_UCIMultiDataset(dataset_name): + dataset = fetch_UCIMulticlassLabelledCollection(dataset_name) + print(f'Dataset {dataset_name}') + print('Dataset stats') + dataset.stats()
+ + + +
+[docs] +@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS) +def test_fetch_lequa2022(dataset_name): + train, gen_val, gen_test = fetch_lequa2022(dataset_name) + print(train.stats()) + print('Val:', gen_val.total()) + print('Test:', gen_test.total())
+ +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_evaluation.html b/docs/build/html/_modules/quapy/tests/test_evaluation.html
new file mode 100644
index 0000000..d5603a4
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_evaluation.html
@@ -0,0 +1,195 @@
[Sphinx page header and navigation markup omitted]

Source code for quapy.tests.test_evaluation

+import unittest
+
+import numpy as np
+
+import quapy as qp
+from sklearn.linear_model import LogisticRegression
+from time import time
+
+from quapy.error import QUANTIFICATION_ERROR_SINGLE, QUANTIFICATION_ERROR, QUANTIFICATION_ERROR_NAMES, \
+    QUANTIFICATION_ERROR_SINGLE_NAMES
+from quapy.method.aggregative import EMQ, PCC
+from quapy.method.base import BaseQuantifier
+
+
+
+[docs] +class EvalTestCase(unittest.TestCase): +
+[docs] + def test_eval_speedup(self): + + data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True) + train, test = data.training, data.test + + protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_state=1) + + class SlowLR(LogisticRegression): + def predict_proba(self, X): + import time + time.sleep(1) + return super().predict_proba(X) + + emq = EMQ(SlowLR()).fit(train) + + tinit = time() + score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True, aggr_speedup='force') + tend_optim = time()-tinit + print(f'evaluation (with optimization) took {tend_optim}s [MAE={score:.4f}]') + + class NonAggregativeEMQ(BaseQuantifier): + + def __init__(self, cls): + self.emq = EMQ(cls) + + def quantify(self, instances): + return self.emq.quantify(instances) + + def fit(self, data): + self.emq.fit(data) + return self + + emq = NonAggregativeEMQ(SlowLR()).fit(train) + + tinit = time() + score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True) + tend_no_optim = time() - tinit + print(f'evaluation (w/o optimization) took {tend_no_optim}s [MAE={score:.4f}]') + + self.assertEqual(tend_no_optim>(tend_optim/2), True)
+ + +
+[docs] + def test_evaluation_output(self): + + data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True) + train, test = data.training, data.test + + qp.environ['SAMPLE_SIZE']=100 + + protocol = qp.protocol.APP(test, random_state=0) + + q = PCC(LogisticRegression()).fit(train) + + single_errors = list(QUANTIFICATION_ERROR_SINGLE_NAMES) + averaged_errors = ['m'+e for e in single_errors] + single_errors = single_errors + [qp.error.from_name(e) for e in single_errors] + averaged_errors = averaged_errors + [qp.error.from_name(e) for e in averaged_errors] + for error_metric, averaged_error_metric in zip(single_errors, averaged_errors): + score = qp.evaluation.evaluate(q, protocol, error_metric=averaged_error_metric) + self.assertTrue(isinstance(score, float)) + + scores = qp.evaluation.evaluate(q, protocol, error_metric=error_metric) + self.assertTrue(isinstance(scores, np.ndarray)) + + self.assertEqual(scores.mean(), score)
+
+ + + + +if __name__ == '__main__': + unittest.main() +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_hierarchy.html b/docs/build/html/_modules/quapy/tests/test_hierarchy.html
new file mode 100644
index 0000000..793091b
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_hierarchy.html
@@ -0,0 +1,143 @@
[Sphinx page header and navigation markup omitted]

Source code for quapy.tests.test_hierarchy

+import unittest
+from sklearn.linear_model import LogisticRegression
+from quapy.method.aggregative import *
+
+
+
+[docs] +class HierarchyTestCase(unittest.TestCase): + +
+[docs] + def test_aggregative(self): + lr = LogisticRegression() + for m in [CC(lr), PCC(lr), ACC(lr), PACC(lr)]: + self.assertEqual(isinstance(m, AggregativeQuantifier), True)
+ + +
+[docs] + def test_binary(self): + lr = LogisticRegression() + for m in [HDy(lr)]: + self.assertEqual(isinstance(m, BinaryQuantifier), True)
+ + +
+[docs] + def test_probabilistic(self): + lr = LogisticRegression() + for m in [CC(lr), ACC(lr)]: + self.assertEqual(isinstance(m, AggregativeCrispQuantifier), True) + self.assertEqual(isinstance(m, AggregativeSoftQuantifier), False) + for m in [PCC(lr), PACC(lr)]: + self.assertEqual(isinstance(m, AggregativeCrispQuantifier), False) + self.assertEqual(isinstance(m, AggregativeSoftQuantifier), True)
+
+ + + +if __name__ == '__main__': + unittest.main() +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_labelcollection.html b/docs/build/html/_modules/quapy/tests/test_labelcollection.html
new file mode 100644
index 0000000..682aeba
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_labelcollection.html
@@ -0,0 +1,176 @@
[Sphinx page header and navigation markup omitted]

Source code for quapy.tests.test_labelcollection

+import unittest
+import numpy as np
+from scipy.sparse import csr_matrix
+
+import quapy as qp
+
+
+
+[docs] +class LabelCollectionTestCase(unittest.TestCase): +
+[docs] + def test_split(self): + x = np.arange(100) + y = np.random.randint(0,5,100) + data = qp.data.LabelledCollection(x,y) + tr, te = data.split_random(0.7) + check_prev = tr.prevalence()*0.7 + te.prevalence()*0.3 + + self.assertEqual(len(tr), 70) + self.assertEqual(len(te), 30) + self.assertEqual(np.allclose(check_prev, data.prevalence()), True) + self.assertEqual(len(tr+te), len(data))
+ + +
+[docs] + def test_join(self): + x = np.arange(50) + y = np.random.randint(2, 5, 50) + data1 = qp.data.LabelledCollection(x, y) + + x = np.arange(200) + y = np.random.randint(0, 3, 200) + data2 = qp.data.LabelledCollection(x, y) + + x = np.arange(100) + y = np.random.randint(0, 6, 100) + data3 = qp.data.LabelledCollection(x, y) + + combined = qp.data.LabelledCollection.join(data1, data2, data3) + self.assertEqual(len(combined), len(data1)+len(data2)+len(data3)) + self.assertEqual(all(combined.classes_ == np.arange(6)), True) + + x = np.random.rand(10, 3) + y = np.random.randint(0, 1, 10) + data4 = qp.data.LabelledCollection(x, y) + with self.assertRaises(Exception): + combined = qp.data.LabelledCollection.join(data1, data2, data3, data4) + + x = np.random.rand(20, 3) + y = np.random.randint(0, 1, 20) + data5 = qp.data.LabelledCollection(x, y) + combined = qp.data.LabelledCollection.join(data4, data5) + self.assertEqual(len(combined), len(data4)+len(data5)) + + x = np.random.rand(10, 4) + y = np.random.randint(0, 1, 10) + data6 = qp.data.LabelledCollection(x, y) + with self.assertRaises(Exception): + combined = qp.data.LabelledCollection.join(data4, data5, data6) + + data4.instances = csr_matrix(data4.instances) + with self.assertRaises(Exception): + combined = qp.data.LabelledCollection.join(data4, data5) + data5.instances = csr_matrix(data5.instances) + combined = qp.data.LabelledCollection.join(data4, data5) + self.assertEqual(len(combined), len(data4) + len(data5))
+
+ + + +if __name__ == '__main__': + unittest.main() +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_methods.html b/docs/build/html/_modules/quapy/tests/test_methods.html
new file mode 100644
index 0000000..e2b28a9
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_methods.html
@@ -0,0 +1,357 @@
[Sphinx page header and navigation markup omitted]

Source code for quapy.tests.test_methods

+import numpy as np
+import pytest
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import LinearSVC
+
+import method.aggregative
+import quapy as qp
+from quapy.model_selection import GridSearchQ
+from quapy.method.base import BinaryQuantifier
+from quapy.data import Dataset, LabelledCollection
+from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS
+from quapy.method.meta import Ensemble
+from quapy.protocol import APP
+from quapy.method.aggregative import DMy
+from quapy.method.meta import MedianEstimator
+
+# datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'),
+#             pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]
+
+tinydatasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True).reduce(), id='tiny_hcr'),
+                pytest.param(qp.datasets.fetch_UCIBinaryDataset('ionosphere').reduce(), id='tiny_ionosphere')]
+
+learners = [LogisticRegression, LinearSVC]
+
+
+
+[docs] +@pytest.mark.parametrize('dataset', tinydatasets) +@pytest.mark.parametrize('aggregative_method', AGGREGATIVE_METHODS) +@pytest.mark.parametrize('learner', learners) +def test_aggregative_methods(dataset: Dataset, aggregative_method, learner): + model = aggregative_method(learner()) + + if isinstance(model, BinaryQuantifier) and not dataset.binary: + print(f'skipping the test of binary model {type(model)} on non-binary dataset {dataset}') + return + + model.fit(dataset.training) + + estim_prevalences = model.quantify(dataset.test.instances) + + true_prevalences = dataset.test.prevalence() + error = qp.error.mae(true_prevalences, estim_prevalences) + + assert type(error) == np.float64
+ + + +
+[docs] +@pytest.mark.parametrize('dataset', tinydatasets) +@pytest.mark.parametrize('non_aggregative_method', NON_AGGREGATIVE_METHODS) +def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method): + model = non_aggregative_method() + + if isinstance(model, BinaryQuantifier) and not dataset.binary: + print(f'skipping the test of binary model {model} on non-binary dataset {dataset}') + return + + model.fit(dataset.training) + + estim_prevalences = model.quantify(dataset.test.instances) + + true_prevalences = dataset.test.prevalence() + error = qp.error.mae(true_prevalences, estim_prevalences) + + assert type(error) == np.float64
+ + + +
+[docs] +@pytest.mark.parametrize('base_method', [method.aggregative.ACC, method.aggregative.PACC]) +@pytest.mark.parametrize('learner', [LogisticRegression]) +@pytest.mark.parametrize('dataset', tinydatasets) +@pytest.mark.parametrize('policy', Ensemble.VALID_POLICIES) +def test_ensemble_method(base_method, learner, dataset: Dataset, policy): + + qp.environ['SAMPLE_SIZE'] = 20 + + base_quantifier=base_method(learner()) + + if not dataset.binary and policy=='ds': + print(f'skipping the test of binary policy ds on non-binary dataset {dataset}') + return + + model = Ensemble(quantifier=base_quantifier, size=3, policy=policy, n_jobs=-1) + + model.fit(dataset.training) + + estim_prevalences = model.quantify(dataset.test.instances) + + true_prevalences = dataset.test.prevalence() + error = qp.error.mae(true_prevalences, estim_prevalences) + + assert type(error) == np.float64
+ + + +
+[docs] +def test_quanet_method(): + try: + import quapy.classification.neural + except ModuleNotFoundError: + print('skipping QuaNet test due to missing torch package') + return + + qp.environ['SAMPLE_SIZE'] = 100 + + # load the kindle dataset as text, and convert words to numerical indexes + dataset = qp.datasets.fetch_reviews('kindle', pickle=True).reduce(200, 200) + qp.data.preprocessing.index(dataset, min_df=5, inplace=True) + + from quapy.classification.neural import CNNnet + cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes) + + from quapy.classification.neural import NeuralClassifierTrainer + learner = NeuralClassifierTrainer(cnn, device='cuda') + + from quapy.method.meta import QuaNet + model = QuaNet(learner, device='cuda') + + if isinstance(model, BinaryQuantifier) and not dataset.binary: + print(f'skipping the test of binary model {model} on non-binary dataset {dataset}') + return + + model.fit(dataset.training) + + estim_prevalences = model.quantify(dataset.test.instances) + + true_prevalences = dataset.test.prevalence() + error = qp.error.mae(true_prevalences, estim_prevalences) + + assert type(error) == np.float64
+ + + +
+[docs] +def test_str_label_names(): + model = qp.method.aggregative.CC(LogisticRegression()) + + dataset = qp.datasets.fetch_reviews('imdb', pickle=True) + dataset = Dataset(dataset.training.sampling(1000, *dataset.training.prevalence()), + dataset.test.sampling(1000, 0.25, 0.75)) + qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True) + + np.random.seed(0) + model.fit(dataset.training) + + int_estim_prevalences = model.quantify(dataset.test.instances) + true_prevalences = dataset.test.prevalence() + + error = qp.error.mae(true_prevalences, int_estim_prevalences) + assert type(error) == np.float64 + + dataset_str = Dataset(LabelledCollection(dataset.training.instances, + ['one' if label == 1 else 'zero' for label in dataset.training.labels]), + LabelledCollection(dataset.test.instances, + ['one' if label == 1 else 'zero' for label in dataset.test.labels])) + assert all(dataset_str.training.classes_ == dataset_str.test.classes_), 'wrong indexation' + np.random.seed(0) + model.fit(dataset_str.training) + + str_estim_prevalences = model.quantify(dataset_str.test.instances) + true_prevalences = dataset_str.test.prevalence() + + error = qp.error.mae(true_prevalences, str_estim_prevalences) + assert type(error) == np.float64 + + print(true_prevalences) + print(int_estim_prevalences) + print(str_estim_prevalences) + + np.testing.assert_almost_equal(int_estim_prevalences[1], + str_estim_prevalences[list(model.classes_).index('one')])
+ + +# helper +def __fit_test(quantifier, train, test): + quantifier.fit(train) + test_samples = APP(test) + true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, test_samples) + return qp.error.mae(true_prevs, estim_prevs), estim_prevs + + +
+[docs] +def test_median_meta(): + """ + This test compares the performance of the MedianQuantifier with respect to computing the median of the predictions + of a differently parameterized quantifier. We use the DistributionMatching base quantifier and the median is + computed across different values of nbins + """ + + qp.environ['SAMPLE_SIZE'] = 100 + + # grid of values + nbins_grid = list(range(2, 11)) + + dataset = 'kindle' + train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test + prevs = [] + errors = [] + for nbins in nbins_grid: + with qp.util.temp_seed(0): + q = DMy(LogisticRegression(), nbins=nbins) + mae, estim_prevs = __fit_test(q, train, test) + prevs.append(estim_prevs) + errors.append(mae) + print(f'{dataset} DistributionMatching(nbins={nbins}) got MAE {mae:.4f}') + prevs = np.asarray(prevs) + mae = np.mean(errors) + print(f'\tMAE={mae:.4f}') + + q = DMy(LogisticRegression()) + q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) + median_mae, prev = __fit_test(q, train, test) + print(f'\tMAE={median_mae:.4f}') + + np.testing.assert_almost_equal(np.median(prevs, axis=0), prev) + assert median_mae < mae, 'the median-based quantifier provided a higher error...'
+ + + +
+[docs] +def test_median_meta_modsel(): + """ + This test checks the median-meta quantifier with model selection + """ + + qp.environ['SAMPLE_SIZE'] = 100 + + dataset = 'kindle' + train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test + train, val = train.split_stratified(random_state=0) + + nbins_grid = [2, 4, 5, 10, 15] + + q = DMy(LogisticRegression()) + q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) + median_mae, _ = __fit_test(q, train, test) + print(f'\tMAE={median_mae:.4f}') + + q = DMy(LogisticRegression()) + lr_params = {'classifier__C': np.logspace(-1, 1, 3)} + q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1) + q = GridSearchQ(q, param_grid=lr_params, protocol=APP(val), n_jobs=-1) + optimized_median_ave, _ = __fit_test(q, train, test) + print(f'\tMAE={optimized_median_ave:.4f}') + + assert optimized_median_ave < median_mae, "the optimized method yielded worse performance..."
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_modsel.html b/docs/build/html/_modules/quapy/tests/test_modsel.html
new file mode 100644
index 0000000..ff1c51c
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_modsel.html
@@ -0,0 +1,225 @@
+<!-- Sphinx page scaffolding (head, theme assets, navigation) omitted; page title: "quapy.tests.test_modsel — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation" -->

Source code for quapy.tests.test_modsel

+import unittest
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+
+import quapy as qp
+from quapy.method.aggregative import PACC
+from quapy.model_selection import GridSearchQ
+from quapy.protocol import APP
+import time
+
+
+
+[docs] +class ModselTestCase(unittest.TestCase): + +
+[docs] + def test_modsel(self): + + q = PACC(LogisticRegression(random_state=1, max_iter=5000)) + + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + training, validation = data.training.split_stratified(0.7, random_state=1) + + param_grid = {'classifier__C': np.logspace(-3,3,7)} + app = APP(validation, sample_size=100, random_state=1) + q = GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True + ).fit(training) + print('best params', q.best_params_) + print('best score', q.best_score_) + + self.assertEqual(q.best_params_['classifier__C'], 10.0) + self.assertEqual(q.best_model().get_params()['classifier__C'], 10.0)
+ + +
+[docs] + def test_modsel_parallel(self): + + q = PACC(LogisticRegression(random_state=1, max_iter=5000)) + + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + training, validation = data.training.split_stratified(0.7, random_state=1) + # test = data.test + + param_grid = {'classifier__C': np.logspace(-3,3,7)} + app = APP(validation, sample_size=100, random_state=1) + q = GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True + ).fit(training) + print('best params', q.best_params_) + print('best score', q.best_score_) + + self.assertEqual(q.best_params_['classifier__C'], 10.0) + self.assertEqual(q.best_model().get_params()['classifier__C'], 10.0)
+ + +
+[docs] + def test_modsel_parallel_speedup(self): + class SlowLR(LogisticRegression): + def fit(self, X, y, sample_weight=None): + time.sleep(1) + return super(SlowLR, self).fit(X, y, sample_weight) + + q = PACC(SlowLR(random_state=1, max_iter=5000)) + + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + training, validation = data.training.split_stratified(0.7, random_state=1) + + param_grid = {'classifier__C': np.logspace(-3, 3, 7)} + app = APP(validation, sample_size=100, random_state=1) + + tinit = time.time() + GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True + ).fit(training) + tend_nooptim = time.time()-tinit + + tinit = time.time() + GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True + ).fit(training) + tend_optim = time.time() - tinit + + print(f'parallel training took {tend_optim:.4f}s') + print(f'sequential training took {tend_nooptim:.4f}s') + + self.assertEqual(tend_optim < (0.5*tend_nooptim), True)
+ + +
+[docs] + def test_modsel_timeout(self): + + class SlowLR(LogisticRegression): + def fit(self, X, y, sample_weight=None): + import time + time.sleep(10) + super(SlowLR, self).fit(X, y, sample_weight) + + q = PACC(SlowLR()) + + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) + training, validation = data.training.split_stratified(0.7, random_state=1) + # test = data.test + + param_grid = {'classifier__C': np.logspace(-3,3,7)} + app = APP(validation, sample_size=100, random_state=1) + q = GridSearchQ( + q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True + ) + with self.assertRaises(TimeoutError): + q.fit(training)
+
+ + + +if __name__ == '__main__': + unittest.main() +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_protocols.html b/docs/build/html/_modules/quapy/tests/test_protocols.html
new file mode 100644
index 0000000..65e6d83
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_protocols.html
@@ -0,0 +1,336 @@
+<!-- Sphinx page scaffolding (head, theme assets, navigation) omitted; page title: "quapy.tests.test_protocols — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation" -->

Source code for quapy.tests.test_protocols

+import unittest
+import numpy as np
+
+import quapy.functional
+from quapy.data import LabelledCollection
+from quapy.protocol import APP, NPP, UPP, DomainMixer, AbstractStochasticSeededProtocol
+
+
+
+[docs] +def mock_labelled_collection(prefix=''): + y = [0] * 250 + [1] * 250 + [2] * 250 + [3] * 250 + X = [prefix + str(i) + '-' + str(yi) for i, yi in enumerate(y)] + return LabelledCollection(X, y, classes=sorted(np.unique(y)))
+ + + +
+[docs] +def samples_to_str(protocol): + samples_str = "" + for instances, prev in protocol(): + samples_str += f'{instances}\t{prev}\n' + return samples_str
+ + + +
+[docs] +class TestProtocols(unittest.TestCase): + +
+[docs] + def test_app_sanity_check(self): + data = mock_labelled_collection() + n_prevpoints = 101 + repeats = 10 + with self.assertRaises(RuntimeError): + p = APP(data, sample_size=5, n_prevalences=n_prevpoints, repeats=repeats, random_state=42) + n_combinations = \ + quapy.functional.num_prevalence_combinations(n_prevpoints, n_classes=data.n_classes, n_repeats=repeats) + p = APP(data, sample_size=5, n_prevalences=n_prevpoints, random_state=42, sanity_check=n_combinations) + p = APP(data, sample_size=5, n_prevalences=n_prevpoints, random_state=42, sanity_check=None)
+ + +
+[docs] + def test_app_replicate(self): + data = mock_labelled_collection() + p = APP(data, sample_size=5, n_prevalences=11, random_state=42) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2) + + p = APP(data, sample_size=5, n_prevalences=11) # <- random_state is by default set to 0 + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2)
+ + +
+[docs] + def test_app_not_replicate(self): + data = mock_labelled_collection() + p = APP(data, sample_size=5, n_prevalences=11, random_state=None) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2) + + p = APP(data, sample_size=5, n_prevalences=11, random_state=42) + samples1 = samples_to_str(p) + p = APP(data, sample_size=5, n_prevalences=11, random_state=0) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2)
+ + +
+    def test_app_number(self):
+        data = mock_labelled_collection()
+        p = APP(data, sample_size=100, n_prevalences=10, repeats=1)
+
+        # surprisingly enough, for some values of n_prevalences the test fails even though
+        # everything is correct. The problem is that in APP.prevalence_grid() a rounding
+        # error sometimes accumulates and surpasses 1.0 (by a tiny float value, e.g.
+        # 0.0000000000002), so those tuples are mistakenly removed... Workarounds based on
+        # np.isclose (and others) were tried, but then some negative probability eventually
+        # shows up in the sampling function...
+
+        count = 0
+        for _ in p():
+            count += 1
+
+        self.assertEqual(count, p.total())
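The comment above describes a classic floating-point pitfall: accumulating a fixed prevalence step can overshoot 1.0 by a tiny amount, so an exact "<= 1.0" filter spuriously drops valid tuples. An illustrative sketch of the effect (plain Python/numpy, not QuaPy's actual prevalence_grid code):

import numpy as np

# accumulating a fixed step does not land exactly on round values
acc = sum([0.1] * 10)
print(acc)                   # 0.9999999999999999
print(acc == 1.0)            # False

acc = 0.1 + 0.2 + 0.3 + 0.4  # "exact" arithmetic gives 1.0
print(acc)                   # 1.0000000000000002
print(acc <= 1.0)            # False: a strict filter would drop this prevalence tuple
print(np.isclose(acc, 1.0))  # True: a tolerance-based check keeps it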
+ + +
+[docs] + def test_npp_replicate(self): + data = mock_labelled_collection() + p = NPP(data, sample_size=5, repeats=5, random_state=42) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2) + + p = NPP(data, sample_size=5, repeats=5) # <- random_state is by default set to 0 + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2)
+ + +
+[docs] + def test_npp_not_replicate(self): + data = mock_labelled_collection() + p = NPP(data, sample_size=5, repeats=5, random_state=None) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2) + + p = NPP(data, sample_size=5, repeats=5, random_state=42) + samples1 = samples_to_str(p) + p = NPP(data, sample_size=5, repeats=5, random_state=0) + samples2 = samples_to_str(p) + self.assertNotEqual(samples1, samples2)
+ + +
+[docs] + def test_kraemer_replicate(self): + data = mock_labelled_collection() + p = UPP(data, sample_size=5, repeats=10, random_state=42) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2) + + p = UPP(data, sample_size=5, repeats=10) # <- random_state is by default set to 0 + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2)
+ + +
+[docs] + def test_kraemer_not_replicate(self): + data = mock_labelled_collection() + p = UPP(data, sample_size=5, repeats=10, random_state=None) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2)
+ + +
+[docs] + def test_covariate_shift_replicate(self): + dataA = mock_labelled_collection('domA') + dataB = mock_labelled_collection('domB') + p = DomainMixer(dataA, dataB, sample_size=10, mixture_points=11, random_state=1) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2) + + p = DomainMixer(dataA, dataB, sample_size=10, mixture_points=11) # <- random_state is by default set to 0 + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertEqual(samples1, samples2)
+ + +
+[docs] + def test_covariate_shift_not_replicate(self): + dataA = mock_labelled_collection('domA') + dataB = mock_labelled_collection('domB') + p = DomainMixer(dataA, dataB, sample_size=10, mixture_points=11, random_state=None) + + samples1 = samples_to_str(p) + samples2 = samples_to_str(p) + + self.assertNotEqual(samples1, samples2)
+ + +
+[docs] + def test_no_seed_init(self): + class NoSeedInit(AbstractStochasticSeededProtocol): + def __init__(self): + self.data = mock_labelled_collection() + + def samples_parameters(self): + # return a matrix containing sampling indexes in the rows + return np.random.randint(0, len(self.data), 10*10).reshape(10, 10) + + def sample(self, params): + index = np.unique(params) + return self.data.sampling_from_index(index) + + p = NoSeedInit() + + # this should raise a ValueError, since the class is said to be AbstractStochasticSeededProtocol but the + # random_seed has never been passed to super(NoSeedInit, self).__init__(random_seed) + with self.assertRaises(ValueError): + for sample in p(): + pass + print('done')
+
+ + + +if __name__ == '__main__': + unittest.main() +
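The last test above documents the contract of AbstractStochasticSeededProtocol: a custom protocol must forward its seed to the parent constructor, otherwise iterating it raises a ValueError. Below is a minimal, hypothetical sketch of a correctly seeded subclass, modelled on NoSeedInit (the way the seed is passed to super().__init__ follows the test's own comment; treat it as an assumption rather than a verified signature):

import numpy as np
from quapy.protocol import AbstractStochasticSeededProtocol

class SeededRandomIndexProtocol(AbstractStochasticSeededProtocol):
    """Hypothetical protocol drawing 10 random-index samples from a LabelledCollection."""

    def __init__(self, data, random_state=0):
        super().__init__(random_state)  # <- forwarding the seed is what NoSeedInit forgets to do
        self.data = data

    def samples_parameters(self):
        # one row of sampling indexes per sample to be generated
        return np.random.randint(0, len(self.data), 10 * 10).reshape(10, 10)

    def sample(self, params):
        return self.data.sampling_from_index(np.unique(params))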
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/tests/test_replicability.html b/docs/build/html/_modules/quapy/tests/test_replicability.html
new file mode 100644
index 0000000..4731cce
--- /dev/null
+++ b/docs/build/html/_modules/quapy/tests/test_replicability.html
@@ -0,0 +1,225 @@
+<!-- Sphinx page scaffolding (head, theme assets, navigation) omitted; page title: "quapy.tests.test_replicability — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation" -->

Source code for quapy.tests.test_replicability

+import unittest
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.functional import strprev
+from sklearn.linear_model import LogisticRegression
+import numpy as np
+from quapy.method.aggregative import PACC
+import quapy.functional as F
+
+
+
+[docs] +class MyTestCase(unittest.TestCase): + +
+[docs] + def test_prediction_replicability(self): + + dataset = qp.datasets.fetch_UCIBinaryDataset('yeast') + + with qp.util.temp_seed(0): + lr = LogisticRegression(random_state=0, max_iter=10000) + pacc = PACC(lr) + prev = pacc.fit(dataset.training).quantify(dataset.test.X) + str_prev1 = strprev(prev, prec=5) + + with qp.util.temp_seed(0): + lr = LogisticRegression(random_state=0, max_iter=10000) + pacc = PACC(lr) + prev2 = pacc.fit(dataset.training).quantify(dataset.test.X) + str_prev2 = strprev(prev2, prec=5) + + self.assertEqual(str_prev1, str_prev2) # add assertion here
+ + + +
+[docs] + def test_samping_replicability(self): + + def equal_collections(c1, c2, value=True): + self.assertEqual(np.all(c1.Xtr == c2.Xtr), value) + self.assertEqual(np.all(c1.ytr == c2.ytr), value) + if value: + self.assertEqual(np.all(c1.classes_ == c2.classes_), value) + + X = list(map(str, range(100))) + y = np.random.randint(0, 2, 100) + data = LabelledCollection(instances=X, labels=y) + + sample1 = data.sampling(50) + sample2 = data.sampling(50) + equal_collections(sample1, sample2, False) + + sample1 = data.sampling(50, random_state=0) + sample2 = data.sampling(50, random_state=0) + equal_collections(sample1, sample2, True) + + sample1 = data.sampling(50, *[0.7, 0.3], random_state=0) + sample2 = data.sampling(50, *[0.7, 0.3], random_state=0) + equal_collections(sample1, sample2, True) + + with qp.util.temp_seed(0): + sample1 = data.sampling(50, *[0.7, 0.3]) + with qp.util.temp_seed(0): + sample2 = data.sampling(50, *[0.7, 0.3]) + equal_collections(sample1, sample2, True) + + sample1 = data.sampling(50, *[0.7, 0.3], random_state=0) + sample2 = data.sampling(50, *[0.7, 0.3], random_state=0) + equal_collections(sample1, sample2, True) + + sample1_tr, sample1_te = data.split_stratified(train_prop=0.7, random_state=0) + sample2_tr, sample2_te = data.split_stratified(train_prop=0.7, random_state=0) + equal_collections(sample1_tr, sample2_tr, True) + equal_collections(sample1_te, sample2_te, True) + + with qp.util.temp_seed(0): + sample1_tr, sample1_te = data.split_stratified(train_prop=0.7) + with qp.util.temp_seed(0): + sample2_tr, sample2_te = data.split_stratified(train_prop=0.7) + equal_collections(sample1_tr, sample2_tr, True) + equal_collections(sample1_te, sample2_te, True)
+ + + +
+[docs] + def test_parallel_replicability(self): + + train, test = qp.datasets.fetch_UCIMulticlassDataset('dry-bean').train_test + + test = test.sampling(500, *[0.1, 0.0, 0.1, 0.1, 0.2, 0.5, 0.0]) + + with qp.util.temp_seed(10): + pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2) + pacc.fit(train, val_split=0.5) + prev1 = F.strprev(pacc.quantify(test.instances)) + + with qp.util.temp_seed(0): + pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2) + pacc.fit(train, val_split=0.5) + prev2 = F.strprev(pacc.quantify(test.instances)) + + with qp.util.temp_seed(0): + pacc = PACC(LogisticRegression(), val_split=2, n_jobs=2) + pacc.fit(train, val_split=0.5) + prev3 = F.strprev(pacc.quantify(test.instances)) + + print(prev1) + print(prev2) + print(prev3) + + self.assertNotEqual(prev1, prev2) + self.assertEqual(prev2, prev3)
+
+ + + + + +if __name__ == '__main__': + unittest.main() +
\ No newline at end of file
diff --git a/docs/build/html/_modules/quapy/util.html b/docs/build/html/_modules/quapy/util.html
new file mode 100644
index 0000000..25532bd
--- /dev/null
+++ b/docs/build/html/_modules/quapy/util.html
@@ -0,0 +1,402 @@
+<!-- Sphinx page scaffolding (head, theme assets, navigation) omitted; page title: "quapy.util — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation" -->

Source code for quapy.util

+import contextlib
+import itertools
+import multiprocessing
+import os
+import pickle
+import urllib
+from pathlib import Path
+from contextlib import ExitStack
+import quapy as qp
+
+import numpy as np
+from joblib import Parallel, delayed
+from time import time
+import signal
+
+
+def _get_parallel_slices(n_tasks, n_jobs):
+    if n_jobs == -1:
+        n_jobs = multiprocessing.cpu_count()
+    batch = int(n_tasks / n_jobs)
+    remainder = n_tasks % n_jobs
+    return [slice(job * batch, (job + 1) * batch + (remainder if job == n_jobs - 1 else 0)) for job in range(n_jobs)]
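For clarity, the helper above splits the task indices into n_jobs contiguous slices, assigning the remainder to the last job. An illustrative call (this is a private helper; user code normally never invokes it directly):

>>> _get_parallel_slices(n_tasks=10, n_jobs=3)
[slice(0, 3), slice(3, 6), slice(6, 10)]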
+
+
+
+def map_parallel(func, args, n_jobs):
+    """
+    Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then
+    func is applied in two parallel processes to args[0:49] and to args[49:99]. func is a function
+    that already works with a list of arguments.
+
+    :param func: function to be parallelized
+    :param args: array-like of arguments to be passed to the function in different parallel calls
+    :param n_jobs: the number of workers
+    """
+    args = np.asarray(args)
+    slices = _get_parallel_slices(len(args), n_jobs)
+    results = Parallel(n_jobs=n_jobs)(
+        delayed(func)(args[slice_i]) for slice_i in slices
+    )
+    return list(itertools.chain.from_iterable(results))
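A tiny usage sketch (illustrative values only): the callable passed to map_parallel receives a whole slice of args and must return a list, since the per-slice results are concatenated at the end.

from quapy.util import map_parallel

def chunk_lengths(chunk):
    # receives a slice of the original array-like and returns a list
    return [len(s) for s in chunk]

print(map_parallel(chunk_lengths, ['a', 'bb', 'ccc', 'dddd'], n_jobs=2))  # [1, 2, 3, 4]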
+ + +
+def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
+    """
+    A wrapper of multiprocessing:
+
+    >>> Parallel(n_jobs=n_jobs)(
+    >>>      delayed(func)(args_i) for args_i in args
+    >>> )
+
+    that takes the `quapy.environ` variable as input silently.
+    Seeds the child processes to ensure reproducibility when n_jobs>1.
+
+    :param func: callable
+    :param args: args of func
+    :param n_jobs: the number of parallel workers
+    :param seed: the numeric seed
+    :param asarray: set to True to return a np.ndarray instead of a list
+    :param backend: indicates the backend used for handling parallel work
+    """
+    def func_dec(environ, seed, *args):
+        qp.environ = environ.copy()
+        qp.environ['N_JOBS'] = 1
+        # set a context with a temporary seed to ensure results are reproducible in parallel
+        with ExitStack() as stack:
+            if seed is not None:
+                stack.enter_context(qp.util.temp_seed(seed))
+            return func(*args)
+
+    out = Parallel(n_jobs=n_jobs, backend=backend)(
+        delayed(func_dec)(qp.environ, None if seed is None else seed+i, args_i) for i, args_i in enumerate(args)
+    )
+    if asarray:
+        out = np.asarray(out)
+    return out
+ + +
+@contextlib.contextmanager
+def temp_seed(random_state):
+    """
+    Can be used in a "with" context to set a temporary seed without modifying numpy's outer random state. E.g.:
+
+    >>> with temp_seed(random_seed):
+    >>>   pass # do any computation depending on np.random functionality
+
+    :param random_state: the seed to set within the "with" context
+    """
+    if random_state is not None:
+        state = np.random.get_state()
+        # save the seed just in case it is needed (for instance, for setting the seed of child processes)
+        qp.environ['_R_SEED'] = random_state
+        np.random.seed(random_state)
+    try:
+        yield
+    finally:
+        if random_state is not None:
+            np.random.set_state(state)
+ + +
[docs]def download_file(url, archive_filename): + """ + Downloads a file from a url + + :param url: the url + :param archive_filename: destination filename + """ + def progress(blocknum, bs, size): + total_sz_mb = '%.2f MB' % (size / 1e6) + current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) + print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='') + print("Downloading %s" % url) + urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) + print("")
+ + +
+def download_file_if_not_exists(url, archive_filename):
+    """
+    Downloads a file (using :meth:`download_file`) if it does not already exist locally.
+
+    :param url: the url
+    :param archive_filename: destination filename
+    """
+    if os.path.exists(archive_filename):
+        return
+    create_if_not_exist(os.path.dirname(archive_filename))
+    download_file(url, archive_filename)
+ + +
[docs]def create_if_not_exist(path): + """ + An alias to `os.makedirs(path, exist_ok=True)` that also returns the path. This is useful in cases like, e.g.: + + >>> path = create_if_not_exist(os.path.join(dir, subdir, anotherdir)) + + :param path: path to create + :return: the path itself + """ + os.makedirs(path, exist_ok=True) + return path
+ + +
+def get_quapy_home():
+    """
+    Gets the home directory of QuaPy, i.e., the directory where QuaPy saves permanent data, such as downloaded
+    datasets. This directory is `~/quapy_data`.
+
+    :return: a string representing the path
+    """
+    home = os.path.join(str(Path.home()), 'quapy_data')
+    os.makedirs(home, exist_ok=True)
+    return home
+ + +
[docs]def create_parent_dir(path): + """ + Creates the parent dir (if any) of a given path, if not exists. E.g., for `./path/to/file.txt`, the path `./path/to` + is created. + + :param path: the path + """ + parentdir = Path(path).parent + if parentdir: + os.makedirs(parentdir, exist_ok=True)
+ + +
+def save_text_file(path, text):
+    """
+    Saves a text file to disk, given its full path, and creates the parent directory if missing.
+
+    :param path: path where to save the file.
+    :param text: text to save.
+    """
+    create_parent_dir(path)
+    with open(path, 'wt') as fout:
+        fout.write(text)
+ + +
[docs]def pickled_resource(pickle_path:str, generation_func:callable, *args): + """ + Allows for fast reuse of resources that are generated only once by calling generation_func(\\*args). The next times + this function is invoked, it loads the pickled resource. Example: + + >>> def some_array(n): # a mock resource created with one parameter (`n`) + >>> return np.random.rand(n) + >>> pickled_resource('./my_array.pkl', some_array, 10) # the resource does not exist: it is created by calling some_array(10) + >>> pickled_resource('./my_array.pkl', some_array, 10) # the resource exists; it is loaded from './my_array.pkl' + + :param pickle_path: the path where to save (first time) and load (next times) the resource + :param generation_func: the function that generates the resource, in case it does not exist in pickle_path + :param args: any arg that generation_func uses for generating the resources + :return: the resource + """ + if pickle_path is None: + return generation_func(*args) + else: + if os.path.exists(pickle_path): + return pickle.load(open(pickle_path, 'rb')) + else: + instance = generation_func(*args) + os.makedirs(str(Path(pickle_path).parent), exist_ok=True) + pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL) + return instance
+ + +def _check_sample_size(sample_size): + if sample_size is None: + assert qp.environ['SAMPLE_SIZE'] is not None, \ + 'error: sample_size set to None, and cannot be resolved from the environment' + sample_size = qp.environ['SAMPLE_SIZE'] + assert isinstance(sample_size, int) and sample_size > 0, \ + 'error: sample_size is not a positive integer' + return sample_size + + +
+class EarlyStop:
+    """
+    A class implementing the early-stopping condition typically used for training neural networks.
+
+    >>> earlystop = EarlyStop(patience=2, lower_is_better=True)
+    >>> earlystop(0.9, epoch=0)
+    >>> earlystop(0.7, epoch=1)
+    >>> earlystop.IMPROVED  # is True
+    >>> earlystop(1.0, epoch=2)
+    >>> earlystop.STOP  # is False (patience=1)
+    >>> earlystop(1.0, epoch=3)
+    >>> earlystop.STOP  # is True (patience=0)
+    >>> earlystop.best_epoch  # is 1
+    >>> earlystop.best_score  # is 0.7
+
+    :param patience: the number of (consecutive) times that a monitored evaluation metric (typically obtained on a
+        held-out validation split) can be found to be worse than the best one obtained so far, before flagging the
+        stopping condition. An instance of this class is `callable`, and is to be used as in the example above.
+    :param lower_is_better: if True (default) the metric is to be minimized.
+    :ivar best_score: keeps track of the best value seen so far
+    :ivar best_epoch: keeps track of the epoch in which the best score was set
+    :ivar STOP: flag (boolean) indicating the stopping condition
+    :ivar IMPROVED: flag (boolean) indicating whether there was an improvement in the last call
+    """
+
+    def __init__(self, patience, lower_is_better=True):
+        self.PATIENCE_LIMIT = patience
+        self.better = lambda a, b: a < b if lower_is_better else a > b
+        self.patience = patience
+        self.best_score = None
+        self.best_epoch = None
+        self.STOP = False
+        self.IMPROVED = False
+
+    def __call__(self, watch_score, epoch):
+        """
+        Commits the new score found in epoch `epoch`. If the score improves over the best score found so far, then
+        the patience counter gets reset. Otherwise, the patience counter is decreased and, in case it reaches 0,
+        the flag STOP becomes True.
+
+        :param watch_score: the new score
+        :param epoch: the current epoch
+        """
+        self.IMPROVED = (self.best_score is None or self.better(watch_score, self.best_score))
+        if self.IMPROVED:
+            self.best_score = watch_score
+            self.best_epoch = epoch
+            self.patience = self.PATIENCE_LIMIT
+        else:
+            self.patience -= 1
+            if self.patience <= 0:
+                self.STOP = True
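A short, hypothetical training-loop sketch showing how EarlyStop is typically wired in (validate() and the epoch budget are placeholders, not part of QuaPy):

from quapy.util import EarlyStop

def validate(epoch):
    # placeholder validation loss: improves until epoch 50, then worsens
    return float(abs(50 - epoch))

earlystop = EarlyStop(patience=3, lower_is_better=True)
for epoch in range(100):
    earlystop(validate(epoch), epoch)
    if earlystop.IMPROVED:
        pass  # e.g., checkpoint the current model here
    if earlystop.STOP:
        print(f'stopping at epoch {epoch}; best score {earlystop.best_score:.4f} at epoch {earlystop.best_epoch}')
        break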
+ + +
[docs]@contextlib.contextmanager +def timeout(seconds): + """ + Opens a context that will launch an exception if not closed after a given number of seconds + + >>> def func(start_msg, end_msg): + >>> print(start_msg) + >>> sleep(2) + >>> print(end_msg) + >>> + >>> with timeout(1): + >>> func('begin function', 'end function') + >>> Out[] + >>> begin function + >>> TimeoutError + + + :param seconds: number of seconds, set to <=0 to ignore the timer + """ + if seconds > 0: + def handler(signum, frame): + raise TimeoutError() + + signal.signal(signal.SIGALRM, handler) + signal.alarm(seconds) + + yield + + if seconds > 0: + signal.alarm(0)
+ +
\ No newline at end of file
(The remainder of this diff adds the generated Sphinx/Read-the-Docs theme assets under docs/build/html/_static/: _sphinx_javascript_frameworks_compat.js, contents.png, css/badge_only.css, css/theme.css, and the binary web fonts under css/fonts/ (Roboto Slab, Lato, FontAwesome). Their minified and binary contents are omitted here.)
before{content:""}.fa-shield:before{content:""}.fa-calendar-o:before{content:""}.fa-fire-extinguisher:before{content:""}.fa-rocket:before{content:""}.fa-maxcdn:before{content:""}.fa-chevron-circle-left:before{content:""}.fa-chevron-circle-right:before{content:""}.fa-chevron-circle-up:before{content:""}.fa-chevron-circle-down:before{content:""}.fa-html5:before{content:""}.fa-css3:before{content:""}.fa-anchor:before{content:""}.fa-unlock-alt:before{content:""}.fa-bullseye:before{content:""}.fa-ellipsis-h:before{content:""}.fa-ellipsis-v:before{content:""}.fa-rss-square:before{content:""}.fa-play-circle:before{content:""}.fa-ticket:before{content:""}.fa-minus-square:before{content:""}.fa-minus-square-o:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before{content:""}.fa-level-up:before{content:""}.fa-level-down:before{content:""}.fa-check-square:before{content:""}.fa-pencil-square:before{content:""}.fa-external-link-square:before{content:""}.fa-share-square:before{content:""}.fa-compass:before{content:""}.fa-caret-square-o-down:before,.fa-toggle-down:before{content:""}.fa-caret-square-o-up:before,.fa-toggle-up:before{content:""}.fa-caret-square-o-right:before,.fa-toggle-right:before{content:""}.fa-eur:before,.fa-euro:before{content:""}.fa-gbp:before{content:""}.fa-dollar:before,.fa-usd:before{content:""}.fa-inr:before,.fa-rupee:before{content:""}.fa-cny:before,.fa-jpy:before,.fa-rmb:before,.fa-yen:before{content:""}.fa-rouble:before,.fa-rub:before,.fa-ruble:before{content:""}.fa-krw:before,.fa-won:before{content:""}.fa-bitcoin:before,.fa-btc:before{content:""}.fa-file:before{content:""}.fa-file-text:before{content:""}.fa-sort-alpha-asc:before{content:""}.fa-sort-alpha-desc:before{content:""}.fa-sort-amount-asc:before{content:""}.fa-sort-amount-desc:before{content:""}.fa-sort-numeric-asc:before{content:""}.fa-sort-numeric-desc:before{content:""}.fa-thumbs-up:before{content:""}.fa-thumbs-down:before{content:""}.fa-youtube-square:before{content:""}.fa-youtube:before{content:""}.fa-xing:before{content:""}.fa-xing-square:before{content:""}.fa-youtube-play:before{content:""}.fa-dropbox:before{content:""}.fa-stack-overflow:before{content:""}.fa-instagram:before{content:""}.fa-flickr:before{content:""}.fa-adn:before{content:""}.fa-bitbucket:before,.icon-bitbucket:before{content:""}.fa-bitbucket-square:before{content:""}.fa-tumblr:before{content:""}.fa-tumblr-square:before{content:""}.fa-long-arrow-down:before{content:""}.fa-long-arrow-up:before{content:""}.fa-long-arrow-left:before{content:""}.fa-long-arrow-right:before{content:""}.fa-apple:before{content:""}.fa-windows:before{content:""}.fa-android:before{content:""}.fa-linux:before{content:""}.fa-dribbble:before{content:""}.fa-skype:before{content:""}.fa-foursquare:before{content:""}.fa-trello:before{content:""}.fa-female:before{content:""}.fa-male:before{content:""}.fa-gittip:before,.fa-gratipay:before{content:""}.fa-sun-o:before{content:""}.fa-moon-o:before{content:""}.fa-archive:before{content:""}.fa-bug:before{content:""}.fa-vk:before{content:""}.fa-weibo:before{content:""}.fa-renren:before{content:""}.fa-pagelines:before{content:""}.fa-stack-exchange:before{content:""}.fa-arrow-circle-o-right:before{content:""}.fa-arrow-circle-o-left:before{content:""}.fa-caret-square-o-left:before,.fa-toggle-left:before{content:""}.fa-dot-circle-o:before{content:""}.fa-wheelchair:before{content:""}.fa-
vimeo-square:before{content:""}.fa-try:before,.fa-turkish-lira:before{content:""}.fa-plus-square-o:before,.wy-menu-vertical li button.toctree-expand:before{content:""}.fa-space-shuttle:before{content:""}.fa-slack:before{content:""}.fa-envelope-square:before{content:""}.fa-wordpress:before{content:""}.fa-openid:before{content:""}.fa-bank:before,.fa-institution:before,.fa-university:before{content:""}.fa-graduation-cap:before,.fa-mortar-board:before{content:""}.fa-yahoo:before{content:""}.fa-google:before{content:""}.fa-reddit:before{content:""}.fa-reddit-square:before{content:""}.fa-stumbleupon-circle:before{content:""}.fa-stumbleupon:before{content:""}.fa-delicious:before{content:""}.fa-digg:before{content:""}.fa-pied-piper-pp:before{content:""}.fa-pied-piper-alt:before{content:""}.fa-drupal:before{content:""}.fa-joomla:before{content:""}.fa-language:before{content:""}.fa-fax:before{content:""}.fa-building:before{content:""}.fa-child:before{content:""}.fa-paw:before{content:""}.fa-spoon:before{content:""}.fa-cube:before{content:""}.fa-cubes:before{content:""}.fa-behance:before{content:""}.fa-behance-square:before{content:""}.fa-steam:before{content:""}.fa-steam-square:before{content:""}.fa-recycle:before{content:""}.fa-automobile:before,.fa-car:before{content:""}.fa-cab:before,.fa-taxi:before{content:""}.fa-tree:before{content:""}.fa-spotify:before{content:""}.fa-deviantart:before{content:""}.fa-soundcloud:before{content:""}.fa-database:before{content:""}.fa-file-pdf-o:before{content:""}.fa-file-word-o:before{content:""}.fa-file-excel-o:before{content:""}.fa-file-powerpoint-o:before{content:""}.fa-file-image-o:before,.fa-file-photo-o:before,.fa-file-picture-o:before{content:""}.fa-file-archive-o:before,.fa-file-zip-o:before{content:""}.fa-file-audio-o:before,.fa-file-sound-o:before{content:""}.fa-file-movie-o:before,.fa-file-video-o:before{content:""}.fa-file-code-o:before{content:""}.fa-vine:before{content:""}.fa-codepen:before{content:""}.fa-jsfiddle:before{content:""}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-ring:before,.fa-life-saver:before,.fa-support:before{content:""}.fa-circle-o-notch:before{content:""}.fa-ra:before,.fa-rebel:before,.fa-resistance:before{content:""}.fa-empire:before,.fa-ge:before{content:""}.fa-git-square:before{content:""}.fa-git:before{content:""}.fa-hacker-news:before,.fa-y-combinator-square:before,.fa-yc-square:before{content:""}.fa-tencent-weibo:before{content:""}.fa-qq:before{content:""}.fa-wechat:before,.fa-weixin:before{content:""}.fa-paper-plane:before,.fa-send:before{content:""}.fa-paper-plane-o:before,.fa-send-o:before{content:""}.fa-history:before{content:""}.fa-circle-thin:before{content:""}.fa-header:before{content:""}.fa-paragraph:before{content:""}.fa-sliders:before{content:""}.fa-share-alt:before{content:""}.fa-share-alt-square:before{content:""}.fa-bomb:before{content:""}.fa-futbol-o:before,.fa-soccer-ball-o:before{content:""}.fa-tty:before{content:""}.fa-binoculars:before{content:""}.fa-plug:before{content:""}.fa-slideshare:before{content:""}.fa-twitch:before{content:""}.fa-yelp:before{content:""}.fa-newspaper-o:before{content:""}.fa-wifi:before{content:""}.fa-calculator:before{content:""}.fa-paypal:before{content:""}.fa-google-wallet:before{content:""}.fa-cc-visa:before{content:""}.fa-cc-mastercard:before{content:""}.fa-cc-discover:before{content:""}.fa-cc-amex:before{content:""}.fa-cc-paypal:before{content:""}.fa-cc-stripe:before{content:""}.fa-b
ell-slash:before{content:""}.fa-bell-slash-o:before{content:""}.fa-trash:before{content:""}.fa-copyright:before{content:""}.fa-at:before{content:""}.fa-eyedropper:before{content:""}.fa-paint-brush:before{content:""}.fa-birthday-cake:before{content:""}.fa-area-chart:before{content:""}.fa-pie-chart:before{content:""}.fa-line-chart:before{content:""}.fa-lastfm:before{content:""}.fa-lastfm-square:before{content:""}.fa-toggle-off:before{content:""}.fa-toggle-on:before{content:""}.fa-bicycle:before{content:""}.fa-bus:before{content:""}.fa-ioxhost:before{content:""}.fa-angellist:before{content:""}.fa-cc:before{content:""}.fa-ils:before,.fa-shekel:before,.fa-sheqel:before{content:""}.fa-meanpath:before{content:""}.fa-buysellads:before{content:""}.fa-connectdevelop:before{content:""}.fa-dashcube:before{content:""}.fa-forumbee:before{content:""}.fa-leanpub:before{content:""}.fa-sellsy:before{content:""}.fa-shirtsinbulk:before{content:""}.fa-simplybuilt:before{content:""}.fa-skyatlas:before{content:""}.fa-cart-plus:before{content:""}.fa-cart-arrow-down:before{content:""}.fa-diamond:before{content:""}.fa-ship:before{content:""}.fa-user-secret:before{content:""}.fa-motorcycle:before{content:""}.fa-street-view:before{content:""}.fa-heartbeat:before{content:""}.fa-venus:before{content:""}.fa-mars:before{content:""}.fa-mercury:before{content:""}.fa-intersex:before,.fa-transgender:before{content:""}.fa-transgender-alt:before{content:""}.fa-venus-double:before{content:""}.fa-mars-double:before{content:""}.fa-venus-mars:before{content:""}.fa-mars-stroke:before{content:""}.fa-mars-stroke-v:before{content:""}.fa-mars-stroke-h:before{content:""}.fa-neuter:before{content:""}.fa-genderless:before{content:""}.fa-facebook-official:before{content:""}.fa-pinterest-p:before{content:""}.fa-whatsapp:before{content:""}.fa-server:before{content:""}.fa-user-plus:before{content:""}.fa-user-times:before{content:""}.fa-bed:before,.fa-hotel:before{content:""}.fa-viacoin:before{content:""}.fa-train:before{content:""}.fa-subway:before{content:""}.fa-medium:before{content:""}.fa-y-combinator:before,.fa-yc:before{content:""}.fa-optin-monster:before{content:""}.fa-opencart:before{content:""}.fa-expeditedssl:before{content:""}.fa-battery-4:before,.fa-battery-full:before,.fa-battery:before{content:""}.fa-battery-3:before,.fa-battery-three-quarters:before{content:""}.fa-battery-2:before,.fa-battery-half:before{content:""}.fa-battery-1:before,.fa-battery-quarter:before{content:""}.fa-battery-0:before,.fa-battery-empty:before{content:""}.fa-mouse-pointer:before{content:""}.fa-i-cursor:before{content:""}.fa-object-group:before{content:""}.fa-object-ungroup:before{content:""}.fa-sticky-note:before{content:""}.fa-sticky-note-o:before{content:""}.fa-cc-jcb:before{content:""}.fa-cc-diners-club:before{content:""}.fa-clone:before{content:""}.fa-balance-scale:before{content:""}.fa-hourglass-o:before{content:""}.fa-hourglass-1:before,.fa-hourglass-start:before{content:""}.fa-hourglass-2:before,.fa-hourglass-half:before{content:""}.fa-hourglass-3:before,.fa-hourglass-end:before{content:""}.fa-hourglass:before{content:""}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:""}.fa-hand-paper-o:before,.fa-hand-stop-o:before{content:""}.fa-hand-scissors-o:before{content:""}.fa-hand-lizard-o:before{content:""}.fa-hand-spock-o:before{content:""}.fa-hand-pointer-o:before{content:""}.fa-hand-peace-o:before{content:""}.fa-trademark:before{content:""}.fa-register
ed:before{content:""}.fa-creative-commons:before{content:""}.fa-gg:before{content:""}.fa-gg-circle:before{content:""}.fa-tripadvisor:before{content:""}.fa-odnoklassniki:before{content:""}.fa-odnoklassniki-square:before{content:""}.fa-get-pocket:before{content:""}.fa-wikipedia-w:before{content:""}.fa-safari:before{content:""}.fa-chrome:before{content:""}.fa-firefox:before{content:""}.fa-opera:before{content:""}.fa-internet-explorer:before{content:""}.fa-television:before,.fa-tv:before{content:""}.fa-contao:before{content:""}.fa-500px:before{content:""}.fa-amazon:before{content:""}.fa-calendar-plus-o:before{content:""}.fa-calendar-minus-o:before{content:""}.fa-calendar-times-o:before{content:""}.fa-calendar-check-o:before{content:""}.fa-industry:before{content:""}.fa-map-pin:before{content:""}.fa-map-signs:before{content:""}.fa-map-o:before{content:""}.fa-map:before{content:""}.fa-commenting:before{content:""}.fa-commenting-o:before{content:""}.fa-houzz:before{content:""}.fa-vimeo:before{content:""}.fa-black-tie:before{content:""}.fa-fonticons:before{content:""}.fa-reddit-alien:before{content:""}.fa-edge:before{content:""}.fa-credit-card-alt:before{content:""}.fa-codiepie:before{content:""}.fa-modx:before{content:""}.fa-fort-awesome:before{content:""}.fa-usb:before{content:""}.fa-product-hunt:before{content:""}.fa-mixcloud:before{content:""}.fa-scribd:before{content:""}.fa-pause-circle:before{content:""}.fa-pause-circle-o:before{content:""}.fa-stop-circle:before{content:""}.fa-stop-circle-o:before{content:""}.fa-shopping-bag:before{content:""}.fa-shopping-basket:before{content:""}.fa-hashtag:before{content:""}.fa-bluetooth:before{content:""}.fa-bluetooth-b:before{content:""}.fa-percent:before{content:""}.fa-gitlab:before,.icon-gitlab:before{content:""}.fa-wpbeginner:before{content:""}.fa-wpforms:before{content:""}.fa-envira:before{content:""}.fa-universal-access:before{content:""}.fa-wheelchair-alt:before{content:""}.fa-question-circle-o:before{content:""}.fa-blind:before{content:""}.fa-audio-description:before{content:""}.fa-volume-control-phone:before{content:""}.fa-braille:before{content:""}.fa-assistive-listening-systems:before{content:""}.fa-american-sign-language-interpreting:before,.fa-asl-interpreting:before{content:""}.fa-deaf:before,.fa-deafness:before,.fa-hard-of-hearing:before{content:""}.fa-glide:before{content:""}.fa-glide-g:before{content:""}.fa-sign-language:before,.fa-signing:before{content:""}.fa-low-vision:before{content:""}.fa-viadeo:before{content:""}.fa-viadeo-square:before{content:""}.fa-snapchat:before{content:""}.fa-snapchat-ghost:before{content:""}.fa-snapchat-square:before{content:""}.fa-pied-piper:before{content:""}.fa-first-order:before{content:""}.fa-yoast:before{content:""}.fa-themeisle:before{content:""}.fa-google-plus-circle:before,.fa-google-plus-official:before{content:""}.fa-fa:before,.fa-font-awesome:before{content:""}.fa-handshake-o:before{content:""}.fa-envelope-open:before{content:""}.fa-envelope-open-o:before{content:""}.fa-linode:before{content:""}.fa-address-book:before{content:""}.fa-address-book-o:before{content:""}.fa-address-card:before,.fa-vcard:before{content:""}.fa-address-card-o:before,.fa-vcard-o:before{content:""}.fa-user-circle:before{content:""}.fa-user-circle-o:before{content:""}.fa-user-o:before{content:""}.fa-id-badge:before{content:""}.fa-drivers-license:before,.fa-id-card:before{content:""}.fa-drivers-license-o:before,.fa-id-card-o:before{c
ontent:""}.fa-quora:before{content:""}.fa-free-code-camp:before{content:""}.fa-telegram:before{content:""}.fa-thermometer-4:before,.fa-thermometer-full:before,.fa-thermometer:before{content:""}.fa-thermometer-3:before,.fa-thermometer-three-quarters:before{content:""}.fa-thermometer-2:before,.fa-thermometer-half:before{content:""}.fa-thermometer-1:before,.fa-thermometer-quarter:before{content:""}.fa-thermometer-0:before,.fa-thermometer-empty:before{content:""}.fa-shower:before{content:""}.fa-bath:before,.fa-bathtub:before,.fa-s15:before{content:""}.fa-podcast:before{content:""}.fa-window-maximize:before{content:""}.fa-window-minimize:before{content:""}.fa-window-restore:before{content:""}.fa-times-rectangle:before,.fa-window-close:before{content:""}.fa-times-rectangle-o:before,.fa-window-close-o:before{content:""}.fa-bandcamp:before{content:""}.fa-grav:before{content:""}.fa-etsy:before{content:""}.fa-imdb:before{content:""}.fa-ravelry:before{content:""}.fa-eercast:before{content:""}.fa-microchip:before{content:""}.fa-snowflake-o:before{content:""}.fa-superpowers:before{content:""}.fa-wpexplorer:before{content:""}.fa-meetup:before{content:""}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0,0,0,0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-dropdown .caret,.wy-inline-validate.wy-inline-validate-danger .wy-input-context,.wy-inline-validate.wy-inline-validate-info .wy-input-context,.wy-inline-validate.wy-inline-validate-success .wy-input-context,.wy-inline-validate.wy-inline-validate-warning .wy-input-context,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{font-family:inherit}.fa:before,.icon:before,.rst-content .admonition-title:before,.rst-content .code-block-caption .headerlink:before,.rst-content .eqno .headerlink:before,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li 
button.toctree-expand:before{font-family:FontAwesome;display:inline-block;font-style:normal;font-weight:400;line-height:1;text-decoration:inherit}.rst-content .code-block-caption a .headerlink,.rst-content .eqno a .headerlink,.rst-content a .admonition-title,.rst-content code.download a span:first-child,.rst-content dl dt a .headerlink,.rst-content h1 a .headerlink,.rst-content h2 a .headerlink,.rst-content h3 a .headerlink,.rst-content h4 a .headerlink,.rst-content h5 a .headerlink,.rst-content h6 a .headerlink,.rst-content p.caption a .headerlink,.rst-content p a .headerlink,.rst-content table>caption a .headerlink,.rst-content tt.download a span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li a button.toctree-expand,a .fa,a .icon,a .rst-content .admonition-title,a .rst-content .code-block-caption .headerlink,a .rst-content .eqno .headerlink,a .rst-content code.download span:first-child,a .rst-content dl dt .headerlink,a .rst-content h1 .headerlink,a .rst-content h2 .headerlink,a .rst-content h3 .headerlink,a .rst-content h4 .headerlink,a .rst-content h5 .headerlink,a .rst-content h6 .headerlink,a .rst-content p.caption .headerlink,a .rst-content p .headerlink,a .rst-content table>caption .headerlink,a .rst-content tt.download span:first-child,a .wy-menu-vertical li button.toctree-expand{display:inline-block;text-decoration:inherit}.btn .fa,.btn .icon,.btn .rst-content .admonition-title,.btn .rst-content .code-block-caption .headerlink,.btn .rst-content .eqno .headerlink,.btn .rst-content code.download span:first-child,.btn .rst-content dl dt .headerlink,.btn .rst-content h1 .headerlink,.btn .rst-content h2 .headerlink,.btn .rst-content h3 .headerlink,.btn .rst-content h4 .headerlink,.btn .rst-content h5 .headerlink,.btn .rst-content h6 .headerlink,.btn .rst-content p .headerlink,.btn .rst-content table>caption .headerlink,.btn .rst-content tt.download span:first-child,.btn .wy-menu-vertical li.current>a button.toctree-expand,.btn .wy-menu-vertical li.on a button.toctree-expand,.btn .wy-menu-vertical li button.toctree-expand,.nav .fa,.nav .icon,.nav .rst-content .admonition-title,.nav .rst-content .code-block-caption .headerlink,.nav .rst-content .eqno .headerlink,.nav .rst-content code.download span:first-child,.nav .rst-content dl dt .headerlink,.nav .rst-content h1 .headerlink,.nav .rst-content h2 .headerlink,.nav .rst-content h3 .headerlink,.nav .rst-content h4 .headerlink,.nav .rst-content h5 .headerlink,.nav .rst-content h6 .headerlink,.nav .rst-content p .headerlink,.nav .rst-content table>caption .headerlink,.nav .rst-content tt.download span:first-child,.nav .wy-menu-vertical li.current>a button.toctree-expand,.nav .wy-menu-vertical li.on a button.toctree-expand,.nav .wy-menu-vertical li button.toctree-expand,.rst-content .btn .admonition-title,.rst-content .code-block-caption .btn .headerlink,.rst-content .code-block-caption .nav .headerlink,.rst-content .eqno .btn .headerlink,.rst-content .eqno .nav .headerlink,.rst-content .nav .admonition-title,.rst-content code.download .btn span:first-child,.rst-content code.download .nav span:first-child,.rst-content dl dt .btn .headerlink,.rst-content dl dt .nav .headerlink,.rst-content h1 .btn .headerlink,.rst-content h1 .nav .headerlink,.rst-content h2 .btn .headerlink,.rst-content h2 .nav .headerlink,.rst-content h3 .btn .headerlink,.rst-content h3 .nav .headerlink,.rst-content h4 .btn .headerlink,.rst-content h4 .nav .headerlink,.rst-content h5 .btn 
.headerlink,.rst-content h5 .nav .headerlink,.rst-content h6 .btn .headerlink,.rst-content h6 .nav .headerlink,.rst-content p .btn .headerlink,.rst-content p .nav .headerlink,.rst-content table>caption .btn .headerlink,.rst-content table>caption .nav .headerlink,.rst-content tt.download .btn span:first-child,.rst-content tt.download .nav span:first-child,.wy-menu-vertical li .btn button.toctree-expand,.wy-menu-vertical li.current>a .btn button.toctree-expand,.wy-menu-vertical li.current>a .nav button.toctree-expand,.wy-menu-vertical li .nav button.toctree-expand,.wy-menu-vertical li.on a .btn button.toctree-expand,.wy-menu-vertical li.on a .nav button.toctree-expand{display:inline}.btn .fa-large.icon,.btn .fa.fa-large,.btn .rst-content .code-block-caption .fa-large.headerlink,.btn .rst-content .eqno .fa-large.headerlink,.btn .rst-content .fa-large.admonition-title,.btn .rst-content code.download span.fa-large:first-child,.btn .rst-content dl dt .fa-large.headerlink,.btn .rst-content h1 .fa-large.headerlink,.btn .rst-content h2 .fa-large.headerlink,.btn .rst-content h3 .fa-large.headerlink,.btn .rst-content h4 .fa-large.headerlink,.btn .rst-content h5 .fa-large.headerlink,.btn .rst-content h6 .fa-large.headerlink,.btn .rst-content p .fa-large.headerlink,.btn .rst-content table>caption .fa-large.headerlink,.btn .rst-content tt.download span.fa-large:first-child,.btn .wy-menu-vertical li button.fa-large.toctree-expand,.nav .fa-large.icon,.nav .fa.fa-large,.nav .rst-content .code-block-caption .fa-large.headerlink,.nav .rst-content .eqno .fa-large.headerlink,.nav .rst-content .fa-large.admonition-title,.nav .rst-content code.download span.fa-large:first-child,.nav .rst-content dl dt .fa-large.headerlink,.nav .rst-content h1 .fa-large.headerlink,.nav .rst-content h2 .fa-large.headerlink,.nav .rst-content h3 .fa-large.headerlink,.nav .rst-content h4 .fa-large.headerlink,.nav .rst-content h5 .fa-large.headerlink,.nav .rst-content h6 .fa-large.headerlink,.nav .rst-content p .fa-large.headerlink,.nav .rst-content table>caption .fa-large.headerlink,.nav .rst-content tt.download span.fa-large:first-child,.nav .wy-menu-vertical li button.fa-large.toctree-expand,.rst-content .btn .fa-large.admonition-title,.rst-content .code-block-caption .btn .fa-large.headerlink,.rst-content .code-block-caption .nav .fa-large.headerlink,.rst-content .eqno .btn .fa-large.headerlink,.rst-content .eqno .nav .fa-large.headerlink,.rst-content .nav .fa-large.admonition-title,.rst-content code.download .btn span.fa-large:first-child,.rst-content code.download .nav span.fa-large:first-child,.rst-content dl dt .btn .fa-large.headerlink,.rst-content dl dt .nav .fa-large.headerlink,.rst-content h1 .btn .fa-large.headerlink,.rst-content h1 .nav .fa-large.headerlink,.rst-content h2 .btn .fa-large.headerlink,.rst-content h2 .nav .fa-large.headerlink,.rst-content h3 .btn .fa-large.headerlink,.rst-content h3 .nav .fa-large.headerlink,.rst-content h4 .btn .fa-large.headerlink,.rst-content h4 .nav .fa-large.headerlink,.rst-content h5 .btn .fa-large.headerlink,.rst-content h5 .nav .fa-large.headerlink,.rst-content h6 .btn .fa-large.headerlink,.rst-content h6 .nav .fa-large.headerlink,.rst-content p .btn .fa-large.headerlink,.rst-content p .nav .fa-large.headerlink,.rst-content table>caption .btn .fa-large.headerlink,.rst-content table>caption .nav .fa-large.headerlink,.rst-content tt.download .btn span.fa-large:first-child,.rst-content tt.download .nav span.fa-large:first-child,.wy-menu-vertical li .btn 
button.fa-large.toctree-expand,.wy-menu-vertical li .nav button.fa-large.toctree-expand{line-height:.9em}.btn .fa-spin.icon,.btn .fa.fa-spin,.btn .rst-content .code-block-caption .fa-spin.headerlink,.btn .rst-content .eqno .fa-spin.headerlink,.btn .rst-content .fa-spin.admonition-title,.btn .rst-content code.download span.fa-spin:first-child,.btn .rst-content dl dt .fa-spin.headerlink,.btn .rst-content h1 .fa-spin.headerlink,.btn .rst-content h2 .fa-spin.headerlink,.btn .rst-content h3 .fa-spin.headerlink,.btn .rst-content h4 .fa-spin.headerlink,.btn .rst-content h5 .fa-spin.headerlink,.btn .rst-content h6 .fa-spin.headerlink,.btn .rst-content p .fa-spin.headerlink,.btn .rst-content table>caption .fa-spin.headerlink,.btn .rst-content tt.download span.fa-spin:first-child,.btn .wy-menu-vertical li button.fa-spin.toctree-expand,.nav .fa-spin.icon,.nav .fa.fa-spin,.nav .rst-content .code-block-caption .fa-spin.headerlink,.nav .rst-content .eqno .fa-spin.headerlink,.nav .rst-content .fa-spin.admonition-title,.nav .rst-content code.download span.fa-spin:first-child,.nav .rst-content dl dt .fa-spin.headerlink,.nav .rst-content h1 .fa-spin.headerlink,.nav .rst-content h2 .fa-spin.headerlink,.nav .rst-content h3 .fa-spin.headerlink,.nav .rst-content h4 .fa-spin.headerlink,.nav .rst-content h5 .fa-spin.headerlink,.nav .rst-content h6 .fa-spin.headerlink,.nav .rst-content p .fa-spin.headerlink,.nav .rst-content table>caption .fa-spin.headerlink,.nav .rst-content tt.download span.fa-spin:first-child,.nav .wy-menu-vertical li button.fa-spin.toctree-expand,.rst-content .btn .fa-spin.admonition-title,.rst-content .code-block-caption .btn .fa-spin.headerlink,.rst-content .code-block-caption .nav .fa-spin.headerlink,.rst-content .eqno .btn .fa-spin.headerlink,.rst-content .eqno .nav .fa-spin.headerlink,.rst-content .nav .fa-spin.admonition-title,.rst-content code.download .btn span.fa-spin:first-child,.rst-content code.download .nav span.fa-spin:first-child,.rst-content dl dt .btn .fa-spin.headerlink,.rst-content dl dt .nav .fa-spin.headerlink,.rst-content h1 .btn .fa-spin.headerlink,.rst-content h1 .nav .fa-spin.headerlink,.rst-content h2 .btn .fa-spin.headerlink,.rst-content h2 .nav .fa-spin.headerlink,.rst-content h3 .btn .fa-spin.headerlink,.rst-content h3 .nav .fa-spin.headerlink,.rst-content h4 .btn .fa-spin.headerlink,.rst-content h4 .nav .fa-spin.headerlink,.rst-content h5 .btn .fa-spin.headerlink,.rst-content h5 .nav .fa-spin.headerlink,.rst-content h6 .btn .fa-spin.headerlink,.rst-content h6 .nav .fa-spin.headerlink,.rst-content p .btn .fa-spin.headerlink,.rst-content p .nav .fa-spin.headerlink,.rst-content table>caption .btn .fa-spin.headerlink,.rst-content table>caption .nav .fa-spin.headerlink,.rst-content tt.download .btn span.fa-spin:first-child,.rst-content tt.download .nav span.fa-spin:first-child,.wy-menu-vertical li .btn button.fa-spin.toctree-expand,.wy-menu-vertical li .nav button.fa-spin.toctree-expand{display:inline-block}.btn.fa:before,.btn.icon:before,.rst-content .btn.admonition-title:before,.rst-content .code-block-caption .btn.headerlink:before,.rst-content .eqno .btn.headerlink:before,.rst-content code.download span.btn:first-child:before,.rst-content dl dt .btn.headerlink:before,.rst-content h1 .btn.headerlink:before,.rst-content h2 .btn.headerlink:before,.rst-content h3 .btn.headerlink:before,.rst-content h4 .btn.headerlink:before,.rst-content h5 .btn.headerlink:before,.rst-content h6 .btn.headerlink:before,.rst-content p .btn.headerlink:before,.rst-content table>caption 
.btn.headerlink:before,.rst-content tt.download span.btn:first-child:before,.wy-menu-vertical li button.btn.toctree-expand:before{opacity:.5;-webkit-transition:opacity .05s ease-in;-moz-transition:opacity .05s ease-in;transition:opacity .05s ease-in}.btn.fa:hover:before,.btn.icon:hover:before,.rst-content .btn.admonition-title:hover:before,.rst-content .code-block-caption .btn.headerlink:hover:before,.rst-content .eqno .btn.headerlink:hover:before,.rst-content code.download span.btn:first-child:hover:before,.rst-content dl dt .btn.headerlink:hover:before,.rst-content h1 .btn.headerlink:hover:before,.rst-content h2 .btn.headerlink:hover:before,.rst-content h3 .btn.headerlink:hover:before,.rst-content h4 .btn.headerlink:hover:before,.rst-content h5 .btn.headerlink:hover:before,.rst-content h6 .btn.headerlink:hover:before,.rst-content p .btn.headerlink:hover:before,.rst-content table>caption .btn.headerlink:hover:before,.rst-content tt.download span.btn:first-child:hover:before,.wy-menu-vertical li button.btn.toctree-expand:hover:before{opacity:1}.btn-mini .fa:before,.btn-mini .icon:before,.btn-mini .rst-content .admonition-title:before,.btn-mini .rst-content .code-block-caption .headerlink:before,.btn-mini .rst-content .eqno .headerlink:before,.btn-mini .rst-content code.download span:first-child:before,.btn-mini .rst-content dl dt .headerlink:before,.btn-mini .rst-content h1 .headerlink:before,.btn-mini .rst-content h2 .headerlink:before,.btn-mini .rst-content h3 .headerlink:before,.btn-mini .rst-content h4 .headerlink:before,.btn-mini .rst-content h5 .headerlink:before,.btn-mini .rst-content h6 .headerlink:before,.btn-mini .rst-content p .headerlink:before,.btn-mini .rst-content table>caption .headerlink:before,.btn-mini .rst-content tt.download span:first-child:before,.btn-mini .wy-menu-vertical li button.toctree-expand:before,.rst-content .btn-mini .admonition-title:before,.rst-content .code-block-caption .btn-mini .headerlink:before,.rst-content .eqno .btn-mini .headerlink:before,.rst-content code.download .btn-mini span:first-child:before,.rst-content dl dt .btn-mini .headerlink:before,.rst-content h1 .btn-mini .headerlink:before,.rst-content h2 .btn-mini .headerlink:before,.rst-content h3 .btn-mini .headerlink:before,.rst-content h4 .btn-mini .headerlink:before,.rst-content h5 .btn-mini .headerlink:before,.rst-content h6 .btn-mini .headerlink:before,.rst-content p .btn-mini .headerlink:before,.rst-content table>caption .btn-mini .headerlink:before,.rst-content tt.download .btn-mini span:first-child:before,.wy-menu-vertical li .btn-mini button.toctree-expand:before{font-size:14px;vertical-align:-15%}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.wy-alert{padding:12px;line-height:24px;margin-bottom:24px;background:#e7f2fa}.rst-content .admonition-title,.wy-alert-title{font-weight:700;display:block;color:#fff;background:#6ab0de;padding:6px 12px;margin:-12px -12px 12px}.rst-content .danger,.rst-content .error,.rst-content .wy-alert-danger.admonition,.rst-content .wy-alert-danger.admonition-todo,.rst-content .wy-alert-danger.attention,.rst-content .wy-alert-danger.caution,.rst-content .wy-alert-danger.hint,.rst-content .wy-alert-danger.important,.rst-content .wy-alert-danger.note,.rst-content .wy-alert-danger.seealso,.rst-content .wy-alert-danger.tip,.rst-content 
.wy-alert-danger.warning,.wy-alert.wy-alert-danger{background:#fdf3f2}.rst-content .danger .admonition-title,.rst-content .danger .wy-alert-title,.rst-content .error .admonition-title,.rst-content .error .wy-alert-title,.rst-content .wy-alert-danger.admonition-todo .admonition-title,.rst-content .wy-alert-danger.admonition-todo .wy-alert-title,.rst-content .wy-alert-danger.admonition .admonition-title,.rst-content .wy-alert-danger.admonition .wy-alert-title,.rst-content .wy-alert-danger.attention .admonition-title,.rst-content .wy-alert-danger.attention .wy-alert-title,.rst-content .wy-alert-danger.caution .admonition-title,.rst-content .wy-alert-danger.caution .wy-alert-title,.rst-content .wy-alert-danger.hint .admonition-title,.rst-content .wy-alert-danger.hint .wy-alert-title,.rst-content .wy-alert-danger.important .admonition-title,.rst-content .wy-alert-danger.important .wy-alert-title,.rst-content .wy-alert-danger.note .admonition-title,.rst-content .wy-alert-danger.note .wy-alert-title,.rst-content .wy-alert-danger.seealso .admonition-title,.rst-content .wy-alert-danger.seealso .wy-alert-title,.rst-content .wy-alert-danger.tip .admonition-title,.rst-content .wy-alert-danger.tip .wy-alert-title,.rst-content .wy-alert-danger.warning .admonition-title,.rst-content .wy-alert-danger.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-danger .admonition-title,.wy-alert.wy-alert-danger .rst-content .admonition-title,.wy-alert.wy-alert-danger .wy-alert-title{background:#f29f97}.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .warning,.rst-content .wy-alert-warning.admonition,.rst-content .wy-alert-warning.danger,.rst-content .wy-alert-warning.error,.rst-content .wy-alert-warning.hint,.rst-content .wy-alert-warning.important,.rst-content .wy-alert-warning.note,.rst-content .wy-alert-warning.seealso,.rst-content .wy-alert-warning.tip,.wy-alert.wy-alert-warning{background:#ffedcc}.rst-content .admonition-todo .admonition-title,.rst-content .admonition-todo .wy-alert-title,.rst-content .attention .admonition-title,.rst-content .attention .wy-alert-title,.rst-content .caution .admonition-title,.rst-content .caution .wy-alert-title,.rst-content .warning .admonition-title,.rst-content .warning .wy-alert-title,.rst-content .wy-alert-warning.admonition .admonition-title,.rst-content .wy-alert-warning.admonition .wy-alert-title,.rst-content .wy-alert-warning.danger .admonition-title,.rst-content .wy-alert-warning.danger .wy-alert-title,.rst-content .wy-alert-warning.error .admonition-title,.rst-content .wy-alert-warning.error .wy-alert-title,.rst-content .wy-alert-warning.hint .admonition-title,.rst-content .wy-alert-warning.hint .wy-alert-title,.rst-content .wy-alert-warning.important .admonition-title,.rst-content .wy-alert-warning.important .wy-alert-title,.rst-content .wy-alert-warning.note .admonition-title,.rst-content .wy-alert-warning.note .wy-alert-title,.rst-content .wy-alert-warning.seealso .admonition-title,.rst-content .wy-alert-warning.seealso .wy-alert-title,.rst-content .wy-alert-warning.tip .admonition-title,.rst-content .wy-alert-warning.tip .wy-alert-title,.rst-content .wy-alert.wy-alert-warning .admonition-title,.wy-alert.wy-alert-warning .rst-content .admonition-title,.wy-alert.wy-alert-warning .wy-alert-title{background:#f0b37e}.rst-content .note,.rst-content .seealso,.rst-content .wy-alert-info.admonition,.rst-content .wy-alert-info.admonition-todo,.rst-content .wy-alert-info.attention,.rst-content .wy-alert-info.caution,.rst-content 
.wy-alert-info.danger,.rst-content .wy-alert-info.error,.rst-content .wy-alert-info.hint,.rst-content .wy-alert-info.important,.rst-content .wy-alert-info.tip,.rst-content .wy-alert-info.warning,.wy-alert.wy-alert-info{background:#e7f2fa}.rst-content .note .admonition-title,.rst-content .note .wy-alert-title,.rst-content .seealso .admonition-title,.rst-content .seealso .wy-alert-title,.rst-content .wy-alert-info.admonition-todo .admonition-title,.rst-content .wy-alert-info.admonition-todo .wy-alert-title,.rst-content .wy-alert-info.admonition .admonition-title,.rst-content .wy-alert-info.admonition .wy-alert-title,.rst-content .wy-alert-info.attention .admonition-title,.rst-content .wy-alert-info.attention .wy-alert-title,.rst-content .wy-alert-info.caution .admonition-title,.rst-content .wy-alert-info.caution .wy-alert-title,.rst-content .wy-alert-info.danger .admonition-title,.rst-content .wy-alert-info.danger .wy-alert-title,.rst-content .wy-alert-info.error .admonition-title,.rst-content .wy-alert-info.error .wy-alert-title,.rst-content .wy-alert-info.hint .admonition-title,.rst-content .wy-alert-info.hint .wy-alert-title,.rst-content .wy-alert-info.important .admonition-title,.rst-content .wy-alert-info.important .wy-alert-title,.rst-content .wy-alert-info.tip .admonition-title,.rst-content .wy-alert-info.tip .wy-alert-title,.rst-content .wy-alert-info.warning .admonition-title,.rst-content .wy-alert-info.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-info .admonition-title,.wy-alert.wy-alert-info .rst-content .admonition-title,.wy-alert.wy-alert-info .wy-alert-title{background:#6ab0de}.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .wy-alert-success.admonition,.rst-content .wy-alert-success.admonition-todo,.rst-content .wy-alert-success.attention,.rst-content .wy-alert-success.caution,.rst-content .wy-alert-success.danger,.rst-content .wy-alert-success.error,.rst-content .wy-alert-success.note,.rst-content .wy-alert-success.seealso,.rst-content .wy-alert-success.warning,.wy-alert.wy-alert-success{background:#dbfaf4}.rst-content .hint .admonition-title,.rst-content .hint .wy-alert-title,.rst-content .important .admonition-title,.rst-content .important .wy-alert-title,.rst-content .tip .admonition-title,.rst-content .tip .wy-alert-title,.rst-content .wy-alert-success.admonition-todo .admonition-title,.rst-content .wy-alert-success.admonition-todo .wy-alert-title,.rst-content .wy-alert-success.admonition .admonition-title,.rst-content .wy-alert-success.admonition .wy-alert-title,.rst-content .wy-alert-success.attention .admonition-title,.rst-content .wy-alert-success.attention .wy-alert-title,.rst-content .wy-alert-success.caution .admonition-title,.rst-content .wy-alert-success.caution .wy-alert-title,.rst-content .wy-alert-success.danger .admonition-title,.rst-content .wy-alert-success.danger .wy-alert-title,.rst-content .wy-alert-success.error .admonition-title,.rst-content .wy-alert-success.error .wy-alert-title,.rst-content .wy-alert-success.note .admonition-title,.rst-content .wy-alert-success.note .wy-alert-title,.rst-content .wy-alert-success.seealso .admonition-title,.rst-content .wy-alert-success.seealso .wy-alert-title,.rst-content .wy-alert-success.warning .admonition-title,.rst-content .wy-alert-success.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-success .admonition-title,.wy-alert.wy-alert-success .rst-content .admonition-title,.wy-alert.wy-alert-success .wy-alert-title{background:#1abc9c}.rst-content 
.wy-alert-neutral.admonition,.rst-content .wy-alert-neutral.admonition-todo,.rst-content .wy-alert-neutral.attention,.rst-content .wy-alert-neutral.caution,.rst-content .wy-alert-neutral.danger,.rst-content .wy-alert-neutral.error,.rst-content .wy-alert-neutral.hint,.rst-content .wy-alert-neutral.important,.rst-content .wy-alert-neutral.note,.rst-content .wy-alert-neutral.seealso,.rst-content .wy-alert-neutral.tip,.rst-content .wy-alert-neutral.warning,.wy-alert.wy-alert-neutral{background:#f3f6f6}.rst-content .wy-alert-neutral.admonition-todo .admonition-title,.rst-content .wy-alert-neutral.admonition-todo .wy-alert-title,.rst-content .wy-alert-neutral.admonition .admonition-title,.rst-content .wy-alert-neutral.admonition .wy-alert-title,.rst-content .wy-alert-neutral.attention .admonition-title,.rst-content .wy-alert-neutral.attention .wy-alert-title,.rst-content .wy-alert-neutral.caution .admonition-title,.rst-content .wy-alert-neutral.caution .wy-alert-title,.rst-content .wy-alert-neutral.danger .admonition-title,.rst-content .wy-alert-neutral.danger .wy-alert-title,.rst-content .wy-alert-neutral.error .admonition-title,.rst-content .wy-alert-neutral.error .wy-alert-title,.rst-content .wy-alert-neutral.hint .admonition-title,.rst-content .wy-alert-neutral.hint .wy-alert-title,.rst-content .wy-alert-neutral.important .admonition-title,.rst-content .wy-alert-neutral.important .wy-alert-title,.rst-content .wy-alert-neutral.note .admonition-title,.rst-content .wy-alert-neutral.note .wy-alert-title,.rst-content .wy-alert-neutral.seealso .admonition-title,.rst-content .wy-alert-neutral.seealso .wy-alert-title,.rst-content .wy-alert-neutral.tip .admonition-title,.rst-content .wy-alert-neutral.tip .wy-alert-title,.rst-content .wy-alert-neutral.warning .admonition-title,.rst-content .wy-alert-neutral.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-neutral .admonition-title,.wy-alert.wy-alert-neutral .rst-content .admonition-title,.wy-alert.wy-alert-neutral .wy-alert-title{color:#404040;background:#e1e4e5}.rst-content .wy-alert-neutral.admonition-todo a,.rst-content .wy-alert-neutral.admonition a,.rst-content .wy-alert-neutral.attention a,.rst-content .wy-alert-neutral.caution a,.rst-content .wy-alert-neutral.danger a,.rst-content .wy-alert-neutral.error a,.rst-content .wy-alert-neutral.hint a,.rst-content .wy-alert-neutral.important a,.rst-content .wy-alert-neutral.note a,.rst-content .wy-alert-neutral.seealso a,.rst-content .wy-alert-neutral.tip a,.rst-content .wy-alert-neutral.warning a,.wy-alert.wy-alert-neutral a{color:#2980b9}.rst-content .admonition-todo p:last-child,.rst-content .admonition p:last-child,.rst-content .attention p:last-child,.rst-content .caution p:last-child,.rst-content .danger p:last-child,.rst-content .error p:last-child,.rst-content .hint p:last-child,.rst-content .important p:last-child,.rst-content .note p:last-child,.rst-content .seealso p:last-child,.rst-content .tip p:last-child,.rst-content .warning p:last-child,.wy-alert p:last-child{margin-bottom:0}.wy-tray-container{position:fixed;bottom:0;left:0;z-index:600}.wy-tray-container li{display:block;width:300px;background:transparent;color:#fff;text-align:center;box-shadow:0 5px 5px 0 rgba(0,0,0,.1);padding:0 24px;min-width:20%;opacity:0;height:0;line-height:56px;overflow:hidden;-webkit-transition:all .3s ease-in;-moz-transition:all .3s ease-in;transition:all .3s ease-in}.wy-tray-container li.wy-tray-item-success{background:#27ae60}.wy-tray-container 
li.wy-tray-item-info{background:#2980b9}.wy-tray-container li.wy-tray-item-warning{background:#e67e22}.wy-tray-container li.wy-tray-item-danger{background:#e74c3c}.wy-tray-container li.on{opacity:1;height:56px}@media screen and (max-width:768px){.wy-tray-container{bottom:auto;top:0;width:100%}.wy-tray-container li{width:100%}}button{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle;cursor:pointer;line-height:normal;-webkit-appearance:button;*overflow:visible}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}button[disabled]{cursor:default}.btn{display:inline-block;border-radius:2px;line-height:normal;white-space:nowrap;text-align:center;cursor:pointer;font-size:100%;padding:6px 12px 8px;color:#fff;border:1px solid rgba(0,0,0,.1);background-color:#27ae60;text-decoration:none;font-weight:400;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 2px -1px hsla(0,0%,100%,.5),inset 0 -2px 0 0 rgba(0,0,0,.1);outline-none:false;vertical-align:middle;*display:inline;zoom:1;-webkit-user-drag:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;-webkit-transition:all .1s linear;-moz-transition:all .1s linear;transition:all .1s linear}.btn-hover{background:#2e8ece;color:#fff}.btn:hover{background:#2cc36b;color:#fff}.btn:focus{background:#2cc36b;outline:0}.btn:active{box-shadow:inset 0 -1px 0 0 rgba(0,0,0,.05),inset 0 2px 0 0 rgba(0,0,0,.1);padding:8px 12px 6px}.btn:visited{color:#fff}.btn-disabled,.btn-disabled:active,.btn-disabled:focus,.btn-disabled:hover,.btn:disabled{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = false);filter:alpha(opacity=40);opacity:.4;cursor:not-allowed;box-shadow:none}.btn::-moz-focus-inner{padding:0;border:0}.btn-small{font-size:80%}.btn-info{background-color:#2980b9!important}.btn-info:hover{background-color:#2e8ece!important}.btn-neutral{background-color:#f3f6f6!important;color:#404040!important}.btn-neutral:hover{background-color:#e5ebeb!important;color:#404040}.btn-neutral:visited{color:#404040!important}.btn-success{background-color:#27ae60!important}.btn-success:hover{background-color:#295!important}.btn-danger{background-color:#e74c3c!important}.btn-danger:hover{background-color:#ea6153!important}.btn-warning{background-color:#e67e22!important}.btn-warning:hover{background-color:#e98b39!important}.btn-invert{background-color:#222}.btn-invert:hover{background-color:#2f2f2f!important}.btn-link{background-color:transparent!important;color:#2980b9;box-shadow:none;border-color:transparent!important}.btn-link:active,.btn-link:hover{background-color:transparent!important;color:#409ad5!important;box-shadow:none}.btn-link:visited{color:#9b59b6}.wy-btn-group .btn,.wy-control .btn{vertical-align:middle}.wy-btn-group{margin-bottom:24px;*zoom:1}.wy-btn-group:after,.wy-btn-group:before{display:table;content:""}.wy-btn-group:after{clear:both}.wy-dropdown{position:relative;display:inline-block}.wy-dropdown-active .wy-dropdown-menu{display:block}.wy-dropdown-menu{position:absolute;left:0;display:none;float:left;top:100%;min-width:100%;background:#fcfcfc;z-index:100;border:1px solid #cfd7dd;box-shadow:0 2px 2px 0 rgba(0,0,0,.1);padding:12px}.wy-dropdown-menu>dd>a{display:block;clear:both;color:#404040;white-space:nowrap;font-size:90%;padding:0 12px;cursor:pointer}.wy-dropdown-menu>dd>a:hover{background:#2980b9;color:#fff}.wy-dropdown-menu>dd.divider{border-top:1px solid #cfd7dd;margin:6px 
0}.wy-dropdown-menu>dd.search{padding-bottom:12px}.wy-dropdown-menu>dd.search input[type=search]{width:100%}.wy-dropdown-menu>dd.call-to-action{background:#e3e3e3;text-transform:uppercase;font-weight:500;font-size:80%}.wy-dropdown-menu>dd.call-to-action:hover{background:#e3e3e3}.wy-dropdown-menu>dd.call-to-action .btn{color:#fff}.wy-dropdown.wy-dropdown-up .wy-dropdown-menu{bottom:100%;top:auto;left:auto;right:0}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu{background:#fcfcfc;margin-top:2px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a{padding:6px 12px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a:hover{background:#2980b9;color:#fff}.wy-dropdown.wy-dropdown-left .wy-dropdown-menu{right:0;left:auto;text-align:right}.wy-dropdown-arrow:before{content:" ";border-bottom:5px solid #f5f5f5;border-left:5px solid transparent;border-right:5px solid transparent;position:absolute;display:block;top:-4px;left:50%;margin-left:-3px}.wy-dropdown-arrow.wy-dropdown-arrow-left:before{left:11px}.wy-form-stacked select{display:block}.wy-form-aligned .wy-help-inline,.wy-form-aligned input,.wy-form-aligned label,.wy-form-aligned select,.wy-form-aligned textarea{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-form-aligned .wy-control-group>label{display:inline-block;vertical-align:middle;width:10em;margin:6px 12px 0 0;float:left}.wy-form-aligned .wy-control{float:left}.wy-form-aligned .wy-control label{display:block}.wy-form-aligned .wy-control select{margin-top:6px}fieldset{margin:0}fieldset,legend{border:0;padding:0}legend{width:100%;white-space:normal;margin-bottom:24px;font-size:150%;*margin-left:-7px}label,legend{display:block}label{margin:0 0 .3125em;color:#333;font-size:90%}input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}.wy-control-group{margin-bottom:24px;max-width:1200px;margin-left:auto;margin-right:auto;*zoom:1}.wy-control-group:after,.wy-control-group:before{display:table;content:""}.wy-control-group:after{clear:both}.wy-control-group.wy-control-group-required>label:after{content:" *";color:#e74c3c}.wy-control-group .wy-form-full,.wy-control-group .wy-form-halves,.wy-control-group .wy-form-thirds{padding-bottom:12px}.wy-control-group .wy-form-full input[type=color],.wy-control-group .wy-form-full input[type=date],.wy-control-group .wy-form-full input[type=datetime-local],.wy-control-group .wy-form-full input[type=datetime],.wy-control-group .wy-form-full input[type=email],.wy-control-group .wy-form-full input[type=month],.wy-control-group .wy-form-full input[type=number],.wy-control-group .wy-form-full input[type=password],.wy-control-group .wy-form-full input[type=search],.wy-control-group .wy-form-full input[type=tel],.wy-control-group .wy-form-full input[type=text],.wy-control-group .wy-form-full input[type=time],.wy-control-group .wy-form-full input[type=url],.wy-control-group .wy-form-full input[type=week],.wy-control-group .wy-form-full select,.wy-control-group .wy-form-halves input[type=color],.wy-control-group .wy-form-halves input[type=date],.wy-control-group .wy-form-halves input[type=datetime-local],.wy-control-group .wy-form-halves input[type=datetime],.wy-control-group .wy-form-halves input[type=email],.wy-control-group .wy-form-halves input[type=month],.wy-control-group .wy-form-halves input[type=number],.wy-control-group .wy-form-halves input[type=password],.wy-control-group .wy-form-halves input[type=search],.wy-control-group .wy-form-halves input[type=tel],.wy-control-group .wy-form-halves 
input[type=text],.wy-control-group .wy-form-halves input[type=time],.wy-control-group .wy-form-halves input[type=url],.wy-control-group .wy-form-halves input[type=week],.wy-control-group .wy-form-halves select,.wy-control-group .wy-form-thirds input[type=color],.wy-control-group .wy-form-thirds input[type=date],.wy-control-group .wy-form-thirds input[type=datetime-local],.wy-control-group .wy-form-thirds input[type=datetime],.wy-control-group .wy-form-thirds input[type=email],.wy-control-group .wy-form-thirds input[type=month],.wy-control-group .wy-form-thirds input[type=number],.wy-control-group .wy-form-thirds input[type=password],.wy-control-group .wy-form-thirds input[type=search],.wy-control-group .wy-form-thirds input[type=tel],.wy-control-group .wy-form-thirds input[type=text],.wy-control-group .wy-form-thirds input[type=time],.wy-control-group .wy-form-thirds input[type=url],.wy-control-group .wy-form-thirds input[type=week],.wy-control-group .wy-form-thirds select{width:100%}.wy-control-group .wy-form-full{float:left;display:block;width:100%;margin-right:0}.wy-control-group .wy-form-full:last-child{margin-right:0}.wy-control-group .wy-form-halves{float:left;display:block;margin-right:2.35765%;width:48.82117%}.wy-control-group .wy-form-halves:last-child,.wy-control-group .wy-form-halves:nth-of-type(2n){margin-right:0}.wy-control-group .wy-form-halves:nth-of-type(odd){clear:left}.wy-control-group .wy-form-thirds{float:left;display:block;margin-right:2.35765%;width:31.76157%}.wy-control-group .wy-form-thirds:last-child,.wy-control-group .wy-form-thirds:nth-of-type(3n){margin-right:0}.wy-control-group .wy-form-thirds:nth-of-type(3n+1){clear:left}.wy-control-group.wy-control-group-no-input .wy-control,.wy-control-no-input{margin:6px 0 0;font-size:90%}.wy-control-no-input{display:inline-block}.wy-control-group.fluid-input input[type=color],.wy-control-group.fluid-input input[type=date],.wy-control-group.fluid-input input[type=datetime-local],.wy-control-group.fluid-input input[type=datetime],.wy-control-group.fluid-input input[type=email],.wy-control-group.fluid-input input[type=month],.wy-control-group.fluid-input input[type=number],.wy-control-group.fluid-input input[type=password],.wy-control-group.fluid-input input[type=search],.wy-control-group.fluid-input input[type=tel],.wy-control-group.fluid-input input[type=text],.wy-control-group.fluid-input input[type=time],.wy-control-group.fluid-input input[type=url],.wy-control-group.fluid-input input[type=week]{width:100%}.wy-form-message-inline{padding-left:.3em;color:#666;font-size:90%}.wy-form-message{display:block;color:#999;font-size:70%;margin-top:.3125em;font-style:italic}.wy-form-message p{font-size:inherit;font-style:italic;margin-bottom:6px}.wy-form-message p:last-child{margin-bottom:0}input{line-height:normal}input[type=button],input[type=reset],input[type=submit]{-webkit-appearance:button;cursor:pointer;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;*overflow:visible}input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week]{-webkit-appearance:none;padding:6px;display:inline-block;border:1px solid #ccc;font-size:80%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 3px #ddd;border-radius:0;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border 
.3s linear}input[type=datetime-local]{padding:.34375em .625em}input[disabled]{cursor:default}input[type=checkbox],input[type=radio]{padding:0;margin-right:.3125em;*height:13px;*width:13px}input[type=checkbox],input[type=radio],input[type=search]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}input[type=search]::-webkit-search-cancel-button,input[type=search]::-webkit-search-decoration{-webkit-appearance:none}input[type=color]:focus,input[type=date]:focus,input[type=datetime-local]:focus,input[type=datetime]:focus,input[type=email]:focus,input[type=month]:focus,input[type=number]:focus,input[type=password]:focus,input[type=search]:focus,input[type=tel]:focus,input[type=text]:focus,input[type=time]:focus,input[type=url]:focus,input[type=week]:focus{outline:0;outline:thin dotted\9;border-color:#333}input.no-focus:focus{border-color:#ccc!important}input[type=checkbox]:focus,input[type=file]:focus,input[type=radio]:focus{outline:thin dotted #333;outline:1px auto #129fea}input[type=color][disabled],input[type=date][disabled],input[type=datetime-local][disabled],input[type=datetime][disabled],input[type=email][disabled],input[type=month][disabled],input[type=number][disabled],input[type=password][disabled],input[type=search][disabled],input[type=tel][disabled],input[type=text][disabled],input[type=time][disabled],input[type=url][disabled],input[type=week][disabled]{cursor:not-allowed;background-color:#fafafa}input:focus:invalid,select:focus:invalid,textarea:focus:invalid{color:#e74c3c;border:1px solid #e74c3c}input:focus:invalid:focus,select:focus:invalid:focus,textarea:focus:invalid:focus{border-color:#e74c3c}input[type=checkbox]:focus:invalid:focus,input[type=file]:focus:invalid:focus,input[type=radio]:focus:invalid:focus{outline-color:#e74c3c}input.wy-input-large{padding:12px;font-size:100%}textarea{overflow:auto;vertical-align:top;width:100%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif}select,textarea{padding:.5em .625em;display:inline-block;border:1px solid #ccc;font-size:80%;box-shadow:inset 0 1px 3px #ddd;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}select{border:1px solid #ccc;background-color:#fff}select[multiple]{height:auto}select:focus,textarea:focus{outline:0}input[readonly],select[disabled],select[readonly],textarea[disabled],textarea[readonly]{cursor:not-allowed;background-color:#fafafa}input[type=checkbox][disabled],input[type=radio][disabled]{cursor:not-allowed}.wy-checkbox,.wy-radio{margin:6px 0;color:#404040;display:block}.wy-checkbox input,.wy-radio input{vertical-align:baseline}.wy-form-message-inline{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-input-prefix,.wy-input-suffix{white-space:nowrap;padding:6px}.wy-input-prefix .wy-input-context,.wy-input-suffix .wy-input-context{line-height:27px;padding:0 8px;display:inline-block;font-size:80%;background-color:#f3f6f6;border:1px solid #ccc;color:#999}.wy-input-suffix .wy-input-context{border-left:0}.wy-input-prefix .wy-input-context{border-right:0}.wy-switch{position:relative;display:block;height:24px;margin-top:12px;cursor:pointer}.wy-switch:before{left:0;top:0;width:36px;height:12px;background:#ccc}.wy-switch:after,.wy-switch:before{position:absolute;content:"";display:block;border-radius:4px;-webkit-transition:all .2s ease-in-out;-moz-transition:all .2s ease-in-out;transition:all .2s ease-in-out}.wy-switch:after{width:18px;height:18px;background:#999;left:-3px;top:-3px}.wy-switch 
span{position:absolute;left:48px;display:block;font-size:12px;color:#ccc;line-height:1}.wy-switch.active:before{background:#1e8449}.wy-switch.active:after{left:24px;background:#27ae60}.wy-switch.disabled{cursor:not-allowed;opacity:.8}.wy-control-group.wy-control-group-error .wy-form-message,.wy-control-group.wy-control-group-error>label{color:#e74c3c}.wy-control-group.wy-control-group-error input[type=color],.wy-control-group.wy-control-group-error input[type=date],.wy-control-group.wy-control-group-error input[type=datetime-local],.wy-control-group.wy-control-group-error input[type=datetime],.wy-control-group.wy-control-group-error input[type=email],.wy-control-group.wy-control-group-error input[type=month],.wy-control-group.wy-control-group-error input[type=number],.wy-control-group.wy-control-group-error input[type=password],.wy-control-group.wy-control-group-error input[type=search],.wy-control-group.wy-control-group-error input[type=tel],.wy-control-group.wy-control-group-error input[type=text],.wy-control-group.wy-control-group-error input[type=time],.wy-control-group.wy-control-group-error input[type=url],.wy-control-group.wy-control-group-error input[type=week],.wy-control-group.wy-control-group-error textarea{border:1px solid #e74c3c}.wy-inline-validate{white-space:nowrap}.wy-inline-validate .wy-input-context{padding:.5em .625em;display:inline-block;font-size:80%}.wy-inline-validate.wy-inline-validate-success .wy-input-context{color:#27ae60}.wy-inline-validate.wy-inline-validate-danger .wy-input-context{color:#e74c3c}.wy-inline-validate.wy-inline-validate-warning .wy-input-context{color:#e67e22}.wy-inline-validate.wy-inline-validate-info .wy-input-context{color:#2980b9}.rotate-90{-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.rotate-180{-webkit-transform:rotate(180deg);-moz-transform:rotate(180deg);-ms-transform:rotate(180deg);-o-transform:rotate(180deg);transform:rotate(180deg)}.rotate-270{-webkit-transform:rotate(270deg);-moz-transform:rotate(270deg);-ms-transform:rotate(270deg);-o-transform:rotate(270deg);transform:rotate(270deg)}.mirror{-webkit-transform:scaleX(-1);-moz-transform:scaleX(-1);-ms-transform:scaleX(-1);-o-transform:scaleX(-1);transform:scaleX(-1)}.mirror.rotate-90{-webkit-transform:scaleX(-1) rotate(90deg);-moz-transform:scaleX(-1) rotate(90deg);-ms-transform:scaleX(-1) rotate(90deg);-o-transform:scaleX(-1) rotate(90deg);transform:scaleX(-1) rotate(90deg)}.mirror.rotate-180{-webkit-transform:scaleX(-1) rotate(180deg);-moz-transform:scaleX(-1) rotate(180deg);-ms-transform:scaleX(-1) rotate(180deg);-o-transform:scaleX(-1) rotate(180deg);transform:scaleX(-1) rotate(180deg)}.mirror.rotate-270{-webkit-transform:scaleX(-1) rotate(270deg);-moz-transform:scaleX(-1) rotate(270deg);-ms-transform:scaleX(-1) rotate(270deg);-o-transform:scaleX(-1) rotate(270deg);transform:scaleX(-1) rotate(270deg)}@media only screen and (max-width:480px){.wy-form button[type=submit]{margin:.7em 0 0}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=text],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week],.wy-form label{margin-bottom:.3em;display:block}.wy-form input[type=color],.wy-form input[type=date],.wy-form 
input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week]{margin-bottom:0}.wy-form-aligned .wy-control-group label{margin-bottom:.3em;text-align:left;display:block;width:100%}.wy-form-aligned .wy-control{margin:1.5em 0 0}.wy-form-message,.wy-form-message-inline,.wy-form .wy-help-inline{display:block;font-size:80%;padding:6px 0}}@media screen and (max-width:768px){.tablet-hide{display:none}}@media screen and (max-width:480px){.mobile-hide{display:none}}.float-left{float:left}.float-right{float:right}.full-width{width:100%}.rst-content table.docutils,.rst-content table.field-list,.wy-table{border-collapse:collapse;border-spacing:0;empty-cells:show;margin-bottom:24px}.rst-content table.docutils caption,.rst-content table.field-list caption,.wy-table caption{color:#000;font:italic 85%/1 arial,sans-serif;padding:1em 0;text-align:center}.rst-content table.docutils td,.rst-content table.docutils th,.rst-content table.field-list td,.rst-content table.field-list th,.wy-table td,.wy-table th{font-size:90%;margin:0;overflow:visible;padding:8px 16px}.rst-content table.docutils td:first-child,.rst-content table.docutils th:first-child,.rst-content table.field-list td:first-child,.rst-content table.field-list th:first-child,.wy-table td:first-child,.wy-table th:first-child{border-left-width:0}.rst-content table.docutils thead,.rst-content table.field-list thead,.wy-table thead{color:#000;text-align:left;vertical-align:bottom;white-space:nowrap}.rst-content table.docutils thead th,.rst-content table.field-list thead th,.wy-table thead th{font-weight:700;border-bottom:2px solid #e1e4e5}.rst-content table.docutils td,.rst-content table.field-list td,.wy-table td{background-color:transparent;vertical-align:middle}.rst-content table.docutils td p,.rst-content table.field-list td p,.wy-table td p{line-height:18px}.rst-content table.docutils td p:last-child,.rst-content table.field-list td p:last-child,.wy-table td p:last-child{margin-bottom:0}.rst-content table.docutils .wy-table-cell-min,.rst-content table.field-list .wy-table-cell-min,.wy-table .wy-table-cell-min{width:1%;padding-right:0}.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox],.wy-table .wy-table-cell-min input[type=checkbox]{margin:0}.wy-table-secondary{color:grey;font-size:90%}.wy-table-tertiary{color:grey;font-size:80%}.rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td,.wy-table-backed,.wy-table-odd td,.wy-table-striped tr:nth-child(2n-1) td{background-color:#f3f6f6}.rst-content table.docutils,.wy-table-bordered-all{border:1px solid #e1e4e5}.rst-content table.docutils td,.wy-table-bordered-all td{border-bottom:1px solid #e1e4e5;border-left:1px solid #e1e4e5}.rst-content table.docutils tbody>tr:last-child td,.wy-table-bordered-all tbody>tr:last-child td{border-bottom-width:0}.wy-table-bordered{border:1px solid #e1e4e5}.wy-table-bordered-rows td{border-bottom:1px solid #e1e4e5}.wy-table-bordered-rows tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal td,.wy-table-horizontal th{border-width:0 0 1px;border-bottom:1px solid #e1e4e5}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-responsive{margin-bottom:24px;max-width:100%;overflow:auto}.wy-table-responsive 
table{margin-bottom:0!important}.wy-table-responsive table td,.wy-table-responsive table th{white-space:nowrap}a{color:#2980b9;text-decoration:none;cursor:pointer}a:hover{color:#3091d1}a:visited{color:#9b59b6}html{height:100%}body,html{overflow-x:hidden}body{font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;font-weight:400;color:#404040;min-height:100%;background:#edf0f2}.wy-text-left{text-align:left}.wy-text-center{text-align:center}.wy-text-right{text-align:right}.wy-text-large{font-size:120%}.wy-text-normal{font-size:100%}.wy-text-small,small{font-size:80%}.wy-text-strike{text-decoration:line-through}.wy-text-warning{color:#e67e22!important}a.wy-text-warning:hover{color:#eb9950!important}.wy-text-info{color:#2980b9!important}a.wy-text-info:hover{color:#409ad5!important}.wy-text-success{color:#27ae60!important}a.wy-text-success:hover{color:#36d278!important}.wy-text-danger{color:#e74c3c!important}a.wy-text-danger:hover{color:#ed7669!important}.wy-text-neutral{color:#404040!important}a.wy-text-neutral:hover{color:#595959!important}.rst-content .toctree-wrapper>p.caption,h1,h2,h3,h4,h5,h6,legend{margin-top:0;font-weight:700;font-family:Roboto Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif}p{line-height:24px;font-size:16px;margin:0 0 24px}h1{font-size:175%}.rst-content .toctree-wrapper>p.caption,h2{font-size:150%}h3{font-size:125%}h4{font-size:115%}h5{font-size:110%}h6{font-size:100%}hr{display:block;height:1px;border:0;border-top:1px solid #e1e4e5;margin:24px 0;padding:0}.rst-content code,.rst-content tt,code{white-space:nowrap;max-width:100%;background:#fff;border:1px solid #e1e4e5;font-size:75%;padding:0 5px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#e74c3c;overflow-x:auto}.rst-content tt.code-large,code.code-large{font-size:90%}.rst-content .section ul,.rst-content .toctree-wrapper ul,.rst-content section ul,.wy-plain-list-disc,article ul{list-style:disc;line-height:24px;margin-bottom:24px}.rst-content .section ul li,.rst-content .toctree-wrapper ul li,.rst-content section ul li,.wy-plain-list-disc li,article ul li{list-style:disc;margin-left:24px}.rst-content .section ul li p:last-child,.rst-content .section ul li ul,.rst-content .toctree-wrapper ul li p:last-child,.rst-content .toctree-wrapper ul li ul,.rst-content section ul li p:last-child,.rst-content section ul li ul,.wy-plain-list-disc li p:last-child,.wy-plain-list-disc li ul,article ul li p:last-child,article ul li ul{margin-bottom:0}.rst-content .section ul li li,.rst-content .toctree-wrapper ul li li,.rst-content section ul li li,.wy-plain-list-disc li li,article ul li li{list-style:circle}.rst-content .section ul li li li,.rst-content .toctree-wrapper ul li li li,.rst-content section ul li li li,.wy-plain-list-disc li li li,article ul li li li{list-style:square}.rst-content .section ul li ol li,.rst-content .toctree-wrapper ul li ol li,.rst-content section ul li ol li,.wy-plain-list-disc li ol li,article ul li ol li{list-style:decimal}.rst-content .section ol,.rst-content .section ol.arabic,.rst-content .toctree-wrapper ol,.rst-content .toctree-wrapper ol.arabic,.rst-content section ol,.rst-content section ol.arabic,.wy-plain-list-decimal,article ol{list-style:decimal;line-height:24px;margin-bottom:24px}.rst-content .section ol.arabic li,.rst-content .section ol li,.rst-content .toctree-wrapper ol.arabic li,.rst-content .toctree-wrapper ol li,.rst-content section ol.arabic li,.rst-content section ol li,.wy-plain-list-decimal li,article ol 
li{list-style:decimal;margin-left:24px}.rst-content .section ol.arabic li ul,.rst-content .section ol li p:last-child,.rst-content .section ol li ul,.rst-content .toctree-wrapper ol.arabic li ul,.rst-content .toctree-wrapper ol li p:last-child,.rst-content .toctree-wrapper ol li ul,.rst-content section ol.arabic li ul,.rst-content section ol li p:last-child,.rst-content section ol li ul,.wy-plain-list-decimal li p:last-child,.wy-plain-list-decimal li ul,article ol li p:last-child,article ol li ul{margin-bottom:0}.rst-content .section ol.arabic li ul li,.rst-content .section ol li ul li,.rst-content .toctree-wrapper ol.arabic li ul li,.rst-content .toctree-wrapper ol li ul li,.rst-content section ol.arabic li ul li,.rst-content section ol li ul li,.wy-plain-list-decimal li ul li,article ol li ul li{list-style:disc}.wy-breadcrumbs{*zoom:1}.wy-breadcrumbs:after,.wy-breadcrumbs:before{display:table;content:""}.wy-breadcrumbs:after{clear:both}.wy-breadcrumbs>li{display:inline-block;padding-top:5px}.wy-breadcrumbs>li.wy-breadcrumbs-aside{float:right}.rst-content .wy-breadcrumbs>li code,.rst-content .wy-breadcrumbs>li tt,.wy-breadcrumbs>li .rst-content tt,.wy-breadcrumbs>li code{all:inherit;color:inherit}.breadcrumb-item:before{content:"/";color:#bbb;font-size:13px;padding:0 6px 0 3px}.wy-breadcrumbs-extra{margin-bottom:0;color:#b3b3b3;font-size:80%;display:inline-block}@media screen and (max-width:480px){.wy-breadcrumbs-extra,.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}@media print{.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}html{font-size:16px}.wy-affix{position:fixed;top:1.618em}.wy-menu a:hover{text-decoration:none}.wy-menu-horiz{*zoom:1}.wy-menu-horiz:after,.wy-menu-horiz:before{display:table;content:""}.wy-menu-horiz:after{clear:both}.wy-menu-horiz li,.wy-menu-horiz ul{display:inline-block}.wy-menu-horiz li:hover{background:hsla(0,0%,100%,.1)}.wy-menu-horiz li.divide-left{border-left:1px solid #404040}.wy-menu-horiz li.divide-right{border-right:1px solid #404040}.wy-menu-horiz a{height:32px;display:inline-block;line-height:32px;padding:0 16px}.wy-menu-vertical{width:300px}.wy-menu-vertical header,.wy-menu-vertical p.caption{color:#55a5d9;height:32px;line-height:32px;padding:0 1.618em;margin:12px 0 0;display:block;font-weight:700;text-transform:uppercase;font-size:85%;white-space:nowrap}.wy-menu-vertical ul{margin-bottom:0}.wy-menu-vertical li.divide-top{border-top:1px solid #404040}.wy-menu-vertical li.divide-bottom{border-bottom:1px solid #404040}.wy-menu-vertical li.current{background:#e3e3e3}.wy-menu-vertical li.current a{color:grey;border-right:1px solid #c9c9c9;padding:.4045em 2.427em}.wy-menu-vertical li.current a:hover{background:#d6d6d6}.rst-content .wy-menu-vertical li tt,.wy-menu-vertical li .rst-content tt,.wy-menu-vertical li code{border:none;background:inherit;color:inherit;padding-left:0;padding-right:0}.wy-menu-vertical li button.toctree-expand{display:block;float:left;margin-left:-1.2em;line-height:18px;color:#4d4d4d;border:none;background:none;padding:0}.wy-menu-vertical li.current>a,.wy-menu-vertical li.on a{color:#404040;font-weight:700;position:relative;background:#fcfcfc;border:none;padding:.4045em 1.618em}.wy-menu-vertical li.current>a:hover,.wy-menu-vertical li.on a:hover{background:#fcfcfc}.wy-menu-vertical li.current>a:hover button.toctree-expand,.wy-menu-vertical li.on a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a 
button.toctree-expand{display:block;line-height:18px;color:#333}.wy-menu-vertical li.toctree-l1.current>a{border-bottom:1px solid #c9c9c9;border-top:1px solid #c9c9c9}.wy-menu-vertical .toctree-l1.current .toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .toctree-l11>ul{display:none}.wy-menu-vertical .toctree-l1.current .current.toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .current.toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .current.toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .current.toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .current.toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .current.toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .current.toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .current.toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .current.toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .current.toctree-l11>ul{display:block}.wy-menu-vertical li.toctree-l3,.wy-menu-vertical li.toctree-l4{font-size:.9em}.wy-menu-vertical li.toctree-l2 a,.wy-menu-vertical li.toctree-l3 a,.wy-menu-vertical li.toctree-l4 a,.wy-menu-vertical li.toctree-l5 a,.wy-menu-vertical li.toctree-l6 a,.wy-menu-vertical li.toctree-l7 a,.wy-menu-vertical li.toctree-l8 a,.wy-menu-vertical li.toctree-l9 a,.wy-menu-vertical li.toctree-l10 a{color:#404040}.wy-menu-vertical li.toctree-l2 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l3 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l4 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l5 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l6 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l7 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l8 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l9 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l10 a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a,.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a,.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a,.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a,.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a,.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a,.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a,.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{display:block}.wy-menu-vertical li.toctree-l2.current>a{padding:.4045em 2.427em}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{padding:.4045em 1.618em .4045em 4.045em}.wy-menu-vertical li.toctree-l3.current>a{padding:.4045em 4.045em}.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{padding:.4045em 1.618em .4045em 5.663em}.wy-menu-vertical li.toctree-l4.current>a{padding:.4045em 5.663em}.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a{padding:.4045em 1.618em .4045em 7.281em}.wy-menu-vertical li.toctree-l5.current>a{padding:.4045em 7.281em}.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a{padding:.4045em 1.618em .4045em 8.899em}.wy-menu-vertical li.toctree-l6.current>a{padding:.4045em 
8.899em}.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a{padding:.4045em 1.618em .4045em 10.517em}.wy-menu-vertical li.toctree-l7.current>a{padding:.4045em 10.517em}.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a{padding:.4045em 1.618em .4045em 12.135em}.wy-menu-vertical li.toctree-l8.current>a{padding:.4045em 12.135em}.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a{padding:.4045em 1.618em .4045em 13.753em}.wy-menu-vertical li.toctree-l9.current>a{padding:.4045em 13.753em}.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a{padding:.4045em 1.618em .4045em 15.371em}.wy-menu-vertical li.toctree-l10.current>a{padding:.4045em 15.371em}.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{padding:.4045em 1.618em .4045em 16.989em}.wy-menu-vertical li.toctree-l2.current>a,.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{background:#c9c9c9}.wy-menu-vertical li.toctree-l2 button.toctree-expand{color:#a3a3a3}.wy-menu-vertical li.toctree-l3.current>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{background:#bdbdbd}.wy-menu-vertical li.toctree-l3 button.toctree-expand{color:#969696}.wy-menu-vertical li.current ul{display:block}.wy-menu-vertical li ul{margin-bottom:0;display:none}.wy-menu-vertical li ul li a{margin-bottom:0;color:#d9d9d9;font-weight:400}.wy-menu-vertical a{line-height:18px;padding:.4045em 1.618em;display:block;position:relative;font-size:90%;color:#d9d9d9}.wy-menu-vertical a:hover{background-color:#4e4a4a;cursor:pointer}.wy-menu-vertical a:hover button.toctree-expand{color:#d9d9d9}.wy-menu-vertical a:active{background-color:#2980b9;cursor:pointer;color:#fff}.wy-menu-vertical a:active button.toctree-expand{color:#fff}.wy-side-nav-search{display:block;width:300px;padding:.809em;margin-bottom:.809em;z-index:200;background-color:#2980b9;text-align:center;color:#fcfcfc}.wy-side-nav-search input[type=text]{width:100%;border-radius:50px;padding:6px 12px;border-color:#2472a4}.wy-side-nav-search img{display:block;margin:auto auto .809em;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-side-nav-search .wy-dropdown>a,.wy-side-nav-search>a{color:#fcfcfc;font-size:100%;font-weight:700;display:inline-block;padding:4px 6px;margin-bottom:.809em;max-width:100%}.wy-side-nav-search .wy-dropdown>a:hover,.wy-side-nav-search>a:hover{background:hsla(0,0%,100%,.1)}.wy-side-nav-search .wy-dropdown>a img.logo,.wy-side-nav-search>a img.logo{display:block;margin:0 auto;height:auto;width:auto;border-radius:0;max-width:100%;background:transparent}.wy-side-nav-search .wy-dropdown>a.icon img.logo,.wy-side-nav-search>a.icon img.logo{margin-top:.85em}.wy-side-nav-search>div.version{margin-top:-.4045em;margin-bottom:.809em;font-weight:400;color:hsla(0,0%,100%,.3)}.wy-nav .wy-menu-vertical header{color:#2980b9}.wy-nav .wy-menu-vertical a{color:#b3b3b3}.wy-nav .wy-menu-vertical a:hover{background-color:#2980b9;color:#fff}[data-menu-wrap]{-webkit-transition:all .2s ease-in;-moz-transition:all .2s ease-in;transition:all .2s 
ease-in;position:absolute;opacity:1;width:100%;opacity:0}[data-menu-wrap].move-center{left:0;right:auto;opacity:1}[data-menu-wrap].move-left{right:auto;left:-100%;opacity:0}[data-menu-wrap].move-right{right:-100%;left:auto;opacity:0}.wy-body-for-nav{background:#fcfcfc}.wy-grid-for-nav{position:absolute;width:100%;height:100%}.wy-nav-side{position:fixed;top:0;bottom:0;left:0;padding-bottom:2em;width:300px;overflow-x:hidden;overflow-y:hidden;min-height:100%;color:#9b9b9b;background:#343131;z-index:200}.wy-side-scroll{width:320px;position:relative;overflow-x:hidden;overflow-y:scroll;height:100%}.wy-nav-top{display:none;background:#2980b9;color:#fff;padding:.4045em .809em;position:relative;line-height:50px;text-align:center;font-size:100%;*zoom:1}.wy-nav-top:after,.wy-nav-top:before{display:table;content:""}.wy-nav-top:after{clear:both}.wy-nav-top a{color:#fff;font-weight:700}.wy-nav-top img{margin-right:12px;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-nav-top i{font-size:30px;float:left;cursor:pointer;padding-top:inherit}.wy-nav-content-wrap{margin-left:300px;background:#fcfcfc;min-height:100%}.wy-nav-content{padding:1.618em 3.236em;height:100%;max-width:800px;margin:auto}.wy-body-mask{position:fixed;width:100%;height:100%;background:rgba(0,0,0,.2);display:none;z-index:499}.wy-body-mask.on{display:block}footer{color:grey}footer p{margin-bottom:12px}.rst-content footer span.commit tt,footer span.commit .rst-content tt,footer span.commit code{padding:0;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:1em;background:none;border:none;color:grey}.rst-footer-buttons{*zoom:1}.rst-footer-buttons:after,.rst-footer-buttons:before{width:100%;display:table;content:""}.rst-footer-buttons:after{clear:both}.rst-breadcrumbs-buttons{margin-top:12px;*zoom:1}.rst-breadcrumbs-buttons:after,.rst-breadcrumbs-buttons:before{display:table;content:""}.rst-breadcrumbs-buttons:after{clear:both}#search-results .search li{margin-bottom:24px;border-bottom:1px solid #e1e4e5;padding-bottom:24px}#search-results .search li:first-child{border-top:1px solid #e1e4e5;padding-top:24px}#search-results .search li a{font-size:120%;margin-bottom:12px;display:inline-block}#search-results .context{color:grey;font-size:90%}.genindextable li>ul{margin-left:24px}@media screen and (max-width:768px){.wy-body-for-nav{background:#fcfcfc}.wy-nav-top{display:block}.wy-nav-side{left:-300px}.wy-nav-side.shift{width:85%;left:0}.wy-menu.wy-menu-vertical,.wy-side-nav-search,.wy-side-scroll{width:auto}.wy-nav-content-wrap{margin-left:0}.wy-nav-content-wrap .wy-nav-content{padding:1.618em}.wy-nav-content-wrap.shift{position:fixed;min-width:100%;left:85%;top:0;height:100%;overflow:hidden}}@media screen and (min-width:1100px){.wy-nav-content-wrap{background:rgba(0,0,0,.05)}.wy-nav-content{margin:0;background:#fcfcfc}}@media print{.rst-versions,.wy-nav-side,footer{display:none}.wy-nav-content-wrap{margin-left:0}}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60;*zoom:1}.rst-versions .rst-current-version:after,.rst-versions .rst-current-version:before{display:table;content:""}.rst-versions 
.rst-current-version:after{clear:both}.rst-content .code-block-caption .rst-versions .rst-current-version .headerlink,.rst-content .eqno .rst-versions .rst-current-version .headerlink,.rst-content .rst-versions .rst-current-version .admonition-title,.rst-content code.download .rst-versions .rst-current-version span:first-child,.rst-content dl dt .rst-versions .rst-current-version .headerlink,.rst-content h1 .rst-versions .rst-current-version .headerlink,.rst-content h2 .rst-versions .rst-current-version .headerlink,.rst-content h3 .rst-versions .rst-current-version .headerlink,.rst-content h4 .rst-versions .rst-current-version .headerlink,.rst-content h5 .rst-versions .rst-current-version .headerlink,.rst-content h6 .rst-versions .rst-current-version .headerlink,.rst-content p .rst-versions .rst-current-version .headerlink,.rst-content table>caption .rst-versions .rst-current-version .headerlink,.rst-content tt.download .rst-versions .rst-current-version span:first-child,.rst-versions .rst-current-version .fa,.rst-versions .rst-current-version .icon,.rst-versions .rst-current-version .rst-content .admonition-title,.rst-versions .rst-current-version .rst-content .code-block-caption .headerlink,.rst-versions .rst-current-version .rst-content .eqno .headerlink,.rst-versions .rst-current-version .rst-content code.download span:first-child,.rst-versions .rst-current-version .rst-content dl dt .headerlink,.rst-versions .rst-current-version .rst-content h1 .headerlink,.rst-versions .rst-current-version .rst-content h2 .headerlink,.rst-versions .rst-current-version .rst-content h3 .headerlink,.rst-versions .rst-current-version .rst-content h4 .headerlink,.rst-versions .rst-current-version .rst-content h5 .headerlink,.rst-versions .rst-current-version .rst-content h6 .headerlink,.rst-versions .rst-current-version .rst-content p .headerlink,.rst-versions .rst-current-version .rst-content table>caption .headerlink,.rst-versions .rst-current-version .rst-content tt.download span:first-child,.rst-versions .rst-current-version .wy-menu-vertical li button.toctree-expand,.wy-menu-vertical li .rst-versions .rst-current-version button.toctree-expand{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and 
(max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}.rst-content .toctree-wrapper>p.caption,.rst-content h1,.rst-content h2,.rst-content h3,.rst-content h4,.rst-content h5,.rst-content h6{margin-bottom:24px}.rst-content img{max-width:100%;height:auto}.rst-content div.figure,.rst-content figure{margin-bottom:24px}.rst-content div.figure .caption-text,.rst-content figure .caption-text{font-style:italic}.rst-content div.figure p:last-child.caption,.rst-content figure p:last-child.caption{margin-bottom:0}.rst-content div.figure.align-center,.rst-content figure.align-center{text-align:center}.rst-content .section>a>img,.rst-content .section>img,.rst-content section>a>img,.rst-content section>img{margin-bottom:24px}.rst-content abbr[title]{text-decoration:none}.rst-content.style-external-links a.reference.external:after{font-family:FontAwesome;content:"\f08e";color:#b3b3b3;vertical-align:super;font-size:60%;margin:0 .2em}.rst-content blockquote{margin-left:24px;line-height:24px;margin-bottom:24px}.rst-content pre.literal-block{white-space:pre;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;display:block;overflow:auto}.rst-content div[class^=highlight],.rst-content pre.literal-block{border:1px solid #e1e4e5;overflow-x:auto;margin:1px 0 24px}.rst-content div[class^=highlight] div[class^=highlight],.rst-content pre.literal-block div[class^=highlight]{padding:0;border:none;margin:0}.rst-content div[class^=highlight] td.code{width:100%}.rst-content .linenodiv pre{border-right:1px solid #e6e9ea;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;user-select:none;pointer-events:none}.rst-content div[class^=highlight] pre{white-space:pre;margin:0;padding:12px;display:block;overflow:auto}.rst-content div[class^=highlight] pre .hll{display:block;margin:0 -12px;padding:0 12px}.rst-content .linenodiv pre,.rst-content div[class^=highlight] pre,.rst-content pre.literal-block{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:12px;line-height:1.4}.rst-content div.highlight .gp,.rst-content div.highlight span.linenos{user-select:none;pointer-events:none}.rst-content div.highlight span.linenos{display:inline-block;padding-left:0;padding-right:12px;margin-right:12px;border-right:1px solid #e6e9ea}.rst-content .code-block-caption{font-style:italic;font-size:85%;line-height:1;padding:1em 0;text-align:center}@media print{.rst-content .codeblock,.rst-content div[class^=highlight],.rst-content div[class^=highlight] pre{white-space:pre-wrap}}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning{clear:both}.rst-content .admonition-todo .last,.rst-content .admonition-todo>:last-child,.rst-content .admonition .last,.rst-content .admonition>:last-child,.rst-content .attention .last,.rst-content .attention>:last-child,.rst-content .caution .last,.rst-content .caution>:last-child,.rst-content .danger .last,.rst-content .danger>:last-child,.rst-content .error .last,.rst-content .error>:last-child,.rst-content .hint .last,.rst-content .hint>:last-child,.rst-content .important .last,.rst-content .important>:last-child,.rst-content .note .last,.rst-content .note>:last-child,.rst-content .seealso 
.last,.rst-content .seealso>:last-child,.rst-content .tip .last,.rst-content .tip>:last-child,.rst-content .warning .last,.rst-content .warning>:last-child{margin-bottom:0}.rst-content .admonition-title:before{margin-right:4px}.rst-content .admonition table{border-color:rgba(0,0,0,.1)}.rst-content .admonition table td,.rst-content .admonition table th{background:transparent!important;border-color:rgba(0,0,0,.1)!important}.rst-content .section ol.loweralpha,.rst-content .section ol.loweralpha>li,.rst-content .toctree-wrapper ol.loweralpha,.rst-content .toctree-wrapper ol.loweralpha>li,.rst-content section ol.loweralpha,.rst-content section ol.loweralpha>li{list-style:lower-alpha}.rst-content .section ol.upperalpha,.rst-content .section ol.upperalpha>li,.rst-content .toctree-wrapper ol.upperalpha,.rst-content .toctree-wrapper ol.upperalpha>li,.rst-content section ol.upperalpha,.rst-content section ol.upperalpha>li{list-style:upper-alpha}.rst-content .section ol li>*,.rst-content .section ul li>*,.rst-content .toctree-wrapper ol li>*,.rst-content .toctree-wrapper ul li>*,.rst-content section ol li>*,.rst-content section ul li>*{margin-top:12px;margin-bottom:12px}.rst-content .section ol li>:first-child,.rst-content .section ul li>:first-child,.rst-content .toctree-wrapper ol li>:first-child,.rst-content .toctree-wrapper ul li>:first-child,.rst-content section ol li>:first-child,.rst-content section ul li>:first-child{margin-top:0}.rst-content .section ol li>p,.rst-content .section ol li>p:last-child,.rst-content .section ul li>p,.rst-content .section ul li>p:last-child,.rst-content .toctree-wrapper ol li>p,.rst-content .toctree-wrapper ol li>p:last-child,.rst-content .toctree-wrapper ul li>p,.rst-content .toctree-wrapper ul li>p:last-child,.rst-content section ol li>p,.rst-content section ol li>p:last-child,.rst-content section ul li>p,.rst-content section ul li>p:last-child{margin-bottom:12px}.rst-content .section ol li>p:only-child,.rst-content .section ol li>p:only-child:last-child,.rst-content .section ul li>p:only-child,.rst-content .section ul li>p:only-child:last-child,.rst-content .toctree-wrapper ol li>p:only-child,.rst-content .toctree-wrapper ol li>p:only-child:last-child,.rst-content .toctree-wrapper ul li>p:only-child,.rst-content .toctree-wrapper ul li>p:only-child:last-child,.rst-content section ol li>p:only-child,.rst-content section ol li>p:only-child:last-child,.rst-content section ul li>p:only-child,.rst-content section ul li>p:only-child:last-child{margin-bottom:0}.rst-content .section ol li>ol,.rst-content .section ol li>ul,.rst-content .section ul li>ol,.rst-content .section ul li>ul,.rst-content .toctree-wrapper ol li>ol,.rst-content .toctree-wrapper ol li>ul,.rst-content .toctree-wrapper ul li>ol,.rst-content .toctree-wrapper ul li>ul,.rst-content section ol li>ol,.rst-content section ol li>ul,.rst-content section ul li>ol,.rst-content section ul li>ul{margin-bottom:12px}.rst-content .section ol.simple li>*,.rst-content .section ol.simple li ol,.rst-content .section ol.simple li ul,.rst-content .section ul.simple li>*,.rst-content .section ul.simple li ol,.rst-content .section ul.simple li ul,.rst-content .toctree-wrapper ol.simple li>*,.rst-content .toctree-wrapper ol.simple li ol,.rst-content .toctree-wrapper ol.simple li ul,.rst-content .toctree-wrapper ul.simple li>*,.rst-content .toctree-wrapper ul.simple li ol,.rst-content .toctree-wrapper ul.simple li ul,.rst-content section ol.simple li>*,.rst-content section ol.simple li ol,.rst-content section ol.simple li 
ul,.rst-content section ul.simple li>*,.rst-content section ul.simple li ol,.rst-content section ul.simple li ul{margin-top:0;margin-bottom:0}.rst-content .line-block{margin-left:0;margin-bottom:24px;line-height:24px}.rst-content .line-block .line-block{margin-left:24px;margin-bottom:0}.rst-content .topic-title{font-weight:700;margin-bottom:12px}.rst-content .toc-backref{color:#404040}.rst-content .align-right{float:right;margin:0 0 24px 24px}.rst-content .align-left{float:left;margin:0 24px 24px 0}.rst-content .align-center{margin:auto}.rst-content .align-center:not(table){display:block}.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink{opacity:0;font-size:14px;font-family:FontAwesome;margin-left:.5em}.rst-content .code-block-caption .headerlink:focus,.rst-content .code-block-caption:hover .headerlink,.rst-content .eqno .headerlink:focus,.rst-content .eqno:hover .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink:focus,.rst-content .toctree-wrapper>p.caption:hover .headerlink,.rst-content dl dt .headerlink:focus,.rst-content dl dt:hover .headerlink,.rst-content h1 .headerlink:focus,.rst-content h1:hover .headerlink,.rst-content h2 .headerlink:focus,.rst-content h2:hover .headerlink,.rst-content h3 .headerlink:focus,.rst-content h3:hover .headerlink,.rst-content h4 .headerlink:focus,.rst-content h4:hover .headerlink,.rst-content h5 .headerlink:focus,.rst-content h5:hover .headerlink,.rst-content h6 .headerlink:focus,.rst-content h6:hover .headerlink,.rst-content p.caption .headerlink:focus,.rst-content p.caption:hover .headerlink,.rst-content p .headerlink:focus,.rst-content p:hover .headerlink,.rst-content table>caption .headerlink:focus,.rst-content table>caption:hover .headerlink{opacity:1}.rst-content p a{overflow-wrap:anywhere}.rst-content .wy-table td p,.rst-content .wy-table td ul,.rst-content .wy-table th p,.rst-content .wy-table th ul,.rst-content table.docutils td p,.rst-content table.docutils td ul,.rst-content table.docutils th p,.rst-content table.docutils th ul,.rst-content table.field-list td p,.rst-content table.field-list td ul,.rst-content table.field-list th p,.rst-content table.field-list th ul{font-size:inherit}.rst-content .btn:focus{outline:2px solid}.rst-content table>caption .headerlink:after{font-size:12px}.rst-content .centered{text-align:center}.rst-content .sidebar{float:right;width:40%;display:block;margin:0 0 24px 24px;padding:24px;background:#f3f6f6;border:1px solid #e1e4e5}.rst-content .sidebar dl,.rst-content .sidebar p,.rst-content .sidebar ul{font-size:90%}.rst-content .sidebar .last,.rst-content .sidebar>:last-child{margin-bottom:0}.rst-content .sidebar .sidebar-title{display:block;font-family:Roboto Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif;font-weight:700;background:#e1e4e5;padding:6px 12px;margin:-24px -24px 24px;font-size:100%}.rst-content .highlighted{background:#f1c40f;box-shadow:0 0 0 2px #f1c40f;display:inline;font-weight:700}.rst-content .citation-reference,.rst-content .footnote-reference{vertical-align:baseline;position:relative;top:-.4em;line-height:0;font-size:90%}.rst-content .citation-reference>span.fn-bracket,.rst-content 
.footnote-reference>span.fn-bracket{display:none}.rst-content .hlist{width:100%}.rst-content dl dt span.classifier:before{content:" : "}.rst-content dl dt span.classifier-delimiter{display:none!important}html.writer-html4 .rst-content table.docutils.citation,html.writer-html4 .rst-content table.docutils.footnote{background:none;border:none}html.writer-html4 .rst-content table.docutils.citation td,html.writer-html4 .rst-content table.docutils.citation tr,html.writer-html4 .rst-content table.docutils.footnote td,html.writer-html4 .rst-content table.docutils.footnote tr{border:none;background-color:transparent!important;white-space:normal}html.writer-html4 .rst-content table.docutils.citation td.label,html.writer-html4 .rst-content table.docutils.footnote td.label{padding-left:0;padding-right:0;vertical-align:top}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.field-list,html.writer-html5 .rst-content dl.footnote{display:grid;grid-template-columns:auto minmax(80%,95%)}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dt{display:inline-grid;grid-template-columns:max-content auto}html.writer-html5 .rst-content aside.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content div.citation{display:grid;grid-template-columns:auto auto minmax(.65rem,auto) minmax(40%,95%)}html.writer-html5 .rst-content aside.citation>span.label,html.writer-html5 .rst-content aside.footnote>span.label,html.writer-html5 .rst-content div.citation>span.label{grid-column-start:1;grid-column-end:2}html.writer-html5 .rst-content aside.citation>span.backrefs,html.writer-html5 .rst-content aside.footnote>span.backrefs,html.writer-html5 .rst-content div.citation>span.backrefs{grid-column-start:2;grid-column-end:3;grid-row-start:1;grid-row-end:3}html.writer-html5 .rst-content aside.citation>p,html.writer-html5 .rst-content aside.footnote>p,html.writer-html5 .rst-content div.citation>p{grid-column-start:4;grid-column-end:5}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.field-list,html.writer-html5 .rst-content dl.footnote{margin-bottom:24px}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dt{padding-left:1rem}html.writer-html5 .rst-content dl.citation>dd,html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dd,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dd,html.writer-html5 .rst-content dl.footnote>dt{margin-bottom:0}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.footnote{font-size:.9rem}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.footnote>dt{margin:0 .5rem .5rem 0;line-height:1.2rem;word-break:break-all;font-weight:400}html.writer-html5 .rst-content dl.citation>dt>span.brackets:before,html.writer-html5 .rst-content dl.footnote>dt>span.brackets:before{content:"["}html.writer-html5 .rst-content dl.citation>dt>span.brackets:after,html.writer-html5 .rst-content dl.footnote>dt>span.brackets:after{content:"]"}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref,html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref{text-align:left;font-style:italic;margin-left:.65rem;word-break:break-word;word-spacing:-.1rem;max-width:5rem}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a,html.writer-html5 
.rst-content dl.footnote>dt>span.fn-backref>a{word-break:keep-all}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a:not(:first-child):before,html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref>a:not(:first-child):before{content:" "}html.writer-html5 .rst-content dl.citation>dd,html.writer-html5 .rst-content dl.footnote>dd{margin:0 0 .5rem;line-height:1.2rem}html.writer-html5 .rst-content dl.citation>dd p,html.writer-html5 .rst-content dl.footnote>dd p{font-size:.9rem}html.writer-html5 .rst-content aside.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content div.citation{padding-left:1rem;padding-right:1rem;font-size:.9rem;line-height:1.2rem}html.writer-html5 .rst-content aside.citation p,html.writer-html5 .rst-content aside.footnote p,html.writer-html5 .rst-content div.citation p{font-size:.9rem;line-height:1.2rem;margin-bottom:12px}html.writer-html5 .rst-content aside.citation span.backrefs,html.writer-html5 .rst-content aside.footnote span.backrefs,html.writer-html5 .rst-content div.citation span.backrefs{text-align:left;font-style:italic;margin-left:.65rem;word-break:break-word;word-spacing:-.1rem;max-width:5rem}html.writer-html5 .rst-content aside.citation span.backrefs>a,html.writer-html5 .rst-content aside.footnote span.backrefs>a,html.writer-html5 .rst-content div.citation span.backrefs>a{word-break:keep-all}html.writer-html5 .rst-content aside.citation span.backrefs>a:not(:first-child):before,html.writer-html5 .rst-content aside.footnote span.backrefs>a:not(:first-child):before,html.writer-html5 .rst-content div.citation span.backrefs>a:not(:first-child):before{content:" "}html.writer-html5 .rst-content aside.citation span.label,html.writer-html5 .rst-content aside.footnote span.label,html.writer-html5 .rst-content div.citation span.label{line-height:1.2rem}html.writer-html5 .rst-content aside.citation-list,html.writer-html5 .rst-content aside.footnote-list,html.writer-html5 .rst-content div.citation-list{margin-bottom:24px}html.writer-html5 .rst-content dl.option-list kbd{font-size:.9rem}.rst-content table.docutils.footnote,html.writer-html4 .rst-content table.docutils.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content aside.footnote-list aside.footnote,html.writer-html5 .rst-content div.citation-list>div.citation,html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.footnote{color:grey}.rst-content table.docutils.footnote code,.rst-content table.docutils.footnote tt,html.writer-html4 .rst-content table.docutils.citation code,html.writer-html4 .rst-content table.docutils.citation tt,html.writer-html5 .rst-content aside.footnote-list aside.footnote code,html.writer-html5 .rst-content aside.footnote-list aside.footnote tt,html.writer-html5 .rst-content aside.footnote code,html.writer-html5 .rst-content aside.footnote tt,html.writer-html5 .rst-content div.citation-list>div.citation code,html.writer-html5 .rst-content div.citation-list>div.citation tt,html.writer-html5 .rst-content dl.citation code,html.writer-html5 .rst-content dl.citation tt,html.writer-html5 .rst-content dl.footnote code,html.writer-html5 .rst-content dl.footnote tt{color:#555}.rst-content .wy-table-responsive.citation,.rst-content .wy-table-responsive.footnote{margin-bottom:0}.rst-content .wy-table-responsive.citation+:not(.citation),.rst-content .wy-table-responsive.footnote+:not(.footnote){margin-top:24px}.rst-content .wy-table-responsive.citation:last-child,.rst-content 
.wy-table-responsive.footnote:last-child{margin-bottom:24px}.rst-content table.docutils th{border-color:#e1e4e5}html.writer-html5 .rst-content table.docutils th{border:1px solid #e1e4e5}html.writer-html5 .rst-content table.docutils td>p,html.writer-html5 .rst-content table.docutils th>p{line-height:1rem;margin-bottom:0;font-size:.9rem}.rst-content table.docutils td .last,.rst-content table.docutils td .last>:last-child{margin-bottom:0}.rst-content table.field-list,.rst-content table.field-list td{border:none}.rst-content table.field-list td p{line-height:inherit}.rst-content table.field-list td>strong{display:inline-block}.rst-content table.field-list .field-name{padding-right:10px;text-align:left;white-space:nowrap}.rst-content table.field-list .field-body{text-align:left}.rst-content code,.rst-content tt{color:#000;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;padding:2px 5px}.rst-content code big,.rst-content code em,.rst-content tt big,.rst-content tt em{font-size:100%!important;line-height:normal}.rst-content code.literal,.rst-content tt.literal{color:#e74c3c;white-space:normal}.rst-content code.xref,.rst-content tt.xref,a .rst-content code,a .rst-content tt{font-weight:700;color:#404040;overflow-wrap:normal}.rst-content kbd,.rst-content pre,.rst-content samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace}.rst-content a code,.rst-content a tt{color:#2980b9}.rst-content dl{margin-bottom:24px}.rst-content dl dt{font-weight:700;margin-bottom:12px}.rst-content dl ol,.rst-content dl p,.rst-content dl table,.rst-content dl ul{margin-bottom:12px}.rst-content dl dd{margin:0 0 12px 24px;line-height:24px}.rst-content dl dd>ol:last-child,.rst-content dl dd>p:last-child,.rst-content dl dd>table:last-child,.rst-content dl dd>ul:last-child{margin-bottom:0}html.writer-html4 .rst-content dl:not(.docutils),html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple){margin-bottom:24px}html.writer-html4 .rst-content dl:not(.docutils)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{display:table;margin:6px 0;font-size:90%;line-height:normal;background:#e7f2fa;color:#2980b9;border-top:3px solid #6ab0de;padding:6px;position:relative}html.writer-html4 .rst-content dl:not(.docutils)>dt:before,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:before{color:#6ab0de}html.writer-html4 .rst-content dl:not(.docutils)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{margin-bottom:6px;border:none;border-left:3px solid #ccc;background:#f0f0f0;color:#555}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink,html.writer-html5 .rst-content 
dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils)>dt:first-child,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:first-child{margin-top:0}html.writer-html4 .rst-content dl:not(.docutils) code.descclassname,html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descclassname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{background-color:transparent;border:none;padding:0;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .optional,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .optional{display:inline-block;padding:0 4px;color:#000;font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .property,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .property{display:inline-block;padding-right:8px;max-width:100%}html.writer-html4 .rst-content dl:not(.docutils) .k,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .k{font-style:italic}html.writer-html4 .rst-content dl:not(.docutils) .descclassname,html.writer-html4 .rst-content dl:not(.docutils) .descname,html.writer-html4 .rst-content dl:not(.docutils) .sig-name,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .sig-name{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#000}.rst-content .viewcode-back,.rst-content .viewcode-link{display:inline-block;color:#27ae60;font-size:80%;padding-left:24px}.rst-content .viewcode-back{display:block;float:right}.rst-content p.rubric{margin-bottom:12px;font-weight:700}.rst-content 
code.download,.rst-content tt.download{background:inherit;padding:inherit;font-weight:400;font-family:inherit;font-size:inherit;color:inherit;border:inherit;white-space:inherit}.rst-content code.download span:first-child,.rst-content tt.download span:first-child{-webkit-font-smoothing:subpixel-antialiased}.rst-content code.download span:first-child:before,.rst-content tt.download span:first-child:before{margin-right:4px}.rst-content .guilabel,.rst-content .menuselection{font-size:80%;font-weight:700;border-radius:4px;padding:2.4px 6px;margin:auto 2px}.rst-content .guilabel,.rst-content .menuselection{border:1px solid #7fbbe3;background:#e7f2fa}.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>.kbd,.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>kbd{color:inherit;font-size:80%;background-color:#fff;border:1px solid #a6a6a6;border-radius:4px;box-shadow:0 2px grey;padding:2.4px 6px;margin:auto 0}.rst-content .versionmodified{font-style:italic}@media screen and (max-width:480px){.rst-content .sidebar{width:100%}}span[id*=MathJax-Span]{color:#404040}.math{text-align:center}@font-face{font-family:Lato;src:url(fonts/lato-normal.woff2?bd03a2cc277bbbc338d464e679fe9942) format("woff2"),url(fonts/lato-normal.woff?27bd77b9162d388cb8d4c4217c7c5e2a) format("woff");font-weight:400;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold.woff2?cccb897485813c7c256901dbca54ecf2) format("woff2"),url(fonts/lato-bold.woff?d878b6c29b10beca227e9eef4246111b) format("woff");font-weight:700;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold-italic.woff2?0b6bb6725576b072c5d0b02ecdd1900d) format("woff2"),url(fonts/lato-bold-italic.woff?9c7e4e9eb485b4a121c760e61bc3707c) format("woff");font-weight:700;font-style:italic;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-normal-italic.woff2?4eb103b4d12be57cb1d040ed5e162e9d) format("woff2"),url(fonts/lato-normal-italic.woff?f28f2d6482446544ef1ea1ccc6dd5892) format("woff");font-weight:400;font-style:italic;font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:400;src:url(fonts/Roboto-Slab-Regular.woff2?7abf5b8d04d26a2cafea937019bca958) format("woff2"),url(fonts/Roboto-Slab-Regular.woff?c1be9284088d487c5e3ff0a10a92e58c) format("woff");font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:700;src:url(fonts/Roboto-Slab-Bold.woff2?9984f4a9bda09be08e83f2506954adbe) format("woff2"),url(fonts/Roboto-Slab-Bold.woff?bed5564a116b05148e3b3bea6fb1162a) format("woff");font-display:block} \ No newline at end of file diff --git a/docs/build/html/_static/js/badge_only.js b/docs/build/html/_static/js/badge_only.js new file mode 100644 index 0000000..526d723 --- /dev/null +++ b/docs/build/html/_static/js/badge_only.js @@ -0,0 +1 @@ +!function(e){var t={};function r(n){if(t[n])return t[n].exports;var o=t[n]={i:n,l:!1,exports:{}};return e[n].call(o.exports,o,o.exports,r),o.l=!0,o.exports}r.m=e,r.c=t,r.d=function(e,t,n){r.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:n})},r.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return 
e[t]}.bind(null,o));return n},r.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return r.d(t,"a",t),t},r.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r.p="",r(r.s=4)}({4:function(e,t,r){}}); \ No newline at end of file diff --git a/docs/build/html/_static/js/html5shiv-printshiv.min.js b/docs/build/html/_static/js/html5shiv-printshiv.min.js new file mode 100644 index 0000000..2b43bd0 --- /dev/null +++ b/docs/build/html/_static/js/html5shiv-printshiv.min.js @@ -0,0 +1,4 @@ +/** +* @preserve HTML5 Shiv 3.7.3-pre | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed +*/ +!function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=y.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=y.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),y.elements=c+" "+a,j(b)}function f(a){var b=x[a[v]];return b||(b={},w++,a[v]=w,x[w]=b),b}function g(a,c,d){if(c||(c=b),q)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():u.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||t.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),q)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return y.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(y,b.frag)}function j(a){a||(a=b);var d=f(a);return!y.shivCSS||p||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),q||i(a,d),a}function k(a){for(var b,c=a.getElementsByTagName("*"),e=c.length,f=RegExp("^(?:"+d().join("|")+")$","i"),g=[];e--;)b=c[e],f.test(b.nodeName)&&g.push(b.applyElement(l(b)));return g}function l(a){for(var b,c=a.attributes,d=c.length,e=a.ownerDocument.createElement(A+":"+a.nodeName);d--;)b=c[d],b.specified&&e.setAttribute(b.nodeName,b.nodeValue);return e.style.cssText=a.style.cssText,e}function m(a){for(var b,c=a.split("{"),e=c.length,f=RegExp("(^|[\\s,>+~])("+d().join("|")+")(?=[[\\s,>+~#.:]|$)","gi"),g="$1"+A+"\\:$2";e--;)b=c[e]=c[e].split("}"),b[b.length-1]=b[b.length-1].replace(f,g),c[e]=b.join("}");return c.join("{")}function n(a){for(var b=a.length;b--;)a[b].removeNode()}function o(a){function b(){clearTimeout(g._removeSheetTimer),d&&d.removeNode(!0),d=null}var d,e,g=f(a),h=a.namespaces,i=a.parentWindow;return!B||a.printShived?a:("undefined"==typeof h[A]&&h.add(A),i.attachEvent("onbeforeprint",function(){b();for(var f,g,h,i=a.styleSheets,j=[],l=i.length,n=Array(l);l--;)n[l]=i[l];for(;h=n.pop();)if(!h.disabled&&z.test(h.media)){try{f=h.imports,g=f.length}catch(o){g=0}for(l=0;g>l;l++)n.push(f[l]);try{j.push(h.cssText)}catch(o){}}j=m(j.reverse().join("")),e=k(a),d=c(a,j)}),i.attachEvent("onafterprint",function(){n(e),clearTimeout(g._removeSheetTimer),g._removeSheetTimer=setTimeout(b,500)}),a.printShived=!0,a)}var 
p,q,r="3.7.3",s=a.html5||{},t=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,u=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,v="_html5shiv",w=0,x={};!function(){try{var a=b.createElement("a");a.innerHTML="",p="hidden"in a,q=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){p=!0,q=!0}}();var y={elements:s.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:r,shivCSS:s.shivCSS!==!1,supportsUnknownElements:q,shivMethods:s.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=y,j(b);var z=/^$|\b(?:all|print)\b/,A="html5shiv",B=!q&&function(){var c=b.documentElement;return!("undefined"==typeof b.namespaces||"undefined"==typeof b.parentWindow||"undefined"==typeof c.applyElement||"undefined"==typeof c.removeNode||"undefined"==typeof a.attachEvent)}();y.type+=" print",y.shivPrint=o,o(b),"object"==typeof module&&module.exports&&(module.exports=y)}("undefined"!=typeof window?window:this,document); \ No newline at end of file diff --git a/docs/build/html/_static/js/html5shiv.min.js b/docs/build/html/_static/js/html5shiv.min.js new file mode 100644 index 0000000..cd1c674 --- /dev/null +++ b/docs/build/html/_static/js/html5shiv.min.js @@ -0,0 +1,4 @@ +/** +* @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed +*/ +!function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=t.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=t.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),t.elements=c+" "+a,j(b)}function f(a){var b=s[a[q]];return b||(b={},r++,a[q]=r,s[r]=b),b}function g(a,c,d){if(c||(c=b),l)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():p.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||o.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),l)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return t.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(t,b.frag)}function j(a){a||(a=b);var d=f(a);return!t.shivCSS||k||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||i(a,d),a}var k,l,m="3.7.3-pre",n=a.html5||{},o=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,q="_html5shiv",r=0,s={};!function(){try{var 
a=b.createElement("a");a.innerHTML="",k="hidden"in a,l=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){k=!0,l=!0}}();var t={elements:n.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:m,shivCSS:n.shivCSS!==!1,supportsUnknownElements:l,shivMethods:n.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=t,j(b),"object"==typeof module&&module.exports&&(module.exports=t)}("undefined"!=typeof window?window:this,document); \ No newline at end of file diff --git a/docs/build/html/_static/js/theme.js b/docs/build/html/_static/js/theme.js new file mode 100644 index 0000000..1fddb6e --- /dev/null +++ b/docs/build/html/_static/js/theme.js @@ -0,0 +1 @@ +!function(n){var e={};function t(i){if(e[i])return e[i].exports;var o=e[i]={i:i,l:!1,exports:{}};return n[i].call(o.exports,o,o.exports,t),o.l=!0,o.exports}t.m=n,t.c=e,t.d=function(n,e,i){t.o(n,e)||Object.defineProperty(n,e,{enumerable:!0,get:i})},t.r=function(n){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(n,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(n,"__esModule",{value:!0})},t.t=function(n,e){if(1&e&&(n=t(n)),8&e)return n;if(4&e&&"object"==typeof n&&n&&n.__esModule)return n;var i=Object.create(null);if(t.r(i),Object.defineProperty(i,"default",{enumerable:!0,value:n}),2&e&&"string"!=typeof n)for(var o in n)t.d(i,o,function(e){return n[e]}.bind(null,o));return i},t.n=function(n){var e=n&&n.__esModule?function(){return n.default}:function(){return n};return t.d(e,"a",e),e},t.o=function(n,e){return Object.prototype.hasOwnProperty.call(n,e)},t.p="",t(t.s=0)}([function(n,e,t){t(1),n.exports=t(3)},function(n,e,t){(function(){var e="undefined"!=typeof window?window.jQuery:t(2);n.exports.ThemeNav={navBar:null,win:null,winScroll:!1,winResize:!1,linkScroll:!1,winPosition:0,winHeight:null,docHeight:null,isRunning:!1,enable:function(n){var t=this;void 0===n&&(n=!0),t.isRunning||(t.isRunning=!0,e((function(e){t.init(e),t.reset(),t.win.on("hashchange",t.reset),n&&t.win.on("scroll",(function(){t.linkScroll||t.winScroll||(t.winScroll=!0,requestAnimationFrame((function(){t.onScroll()})))})),t.win.on("resize",(function(){t.winResize||(t.winResize=!0,requestAnimationFrame((function(){t.onResize()})))})),t.onResize()})))},enableSticky:function(){this.enable(!0)},init:function(n){n(document);var e=this;this.navBar=n("div.wy-side-scroll:first"),this.win=n(window),n(document).on("click","[data-toggle='wy-nav-top']",(function(){n("[data-toggle='wy-nav-shift']").toggleClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift")})).on("click",".wy-menu-vertical .current ul li a",(function(){var t=n(this);n("[data-toggle='wy-nav-shift']").removeClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift"),e.toggleCurrent(t),e.hashChange()})).on("click","[data-toggle='rst-current-version']",(function(){n("[data-toggle='rst-versions']").toggleClass("shift-up")})),n("table.docutils:not(.field-list,.footnote,.citation)").wrap("
"),n("table.docutils.footnote").wrap("
"),n("table.docutils.citation").wrap("
"),n(".wy-menu-vertical ul").not(".simple").siblings("a").each((function(){var t=n(this);expand=n(''),expand.on("click",(function(n){return e.toggleCurrent(t),n.stopPropagation(),!1})),t.prepend(expand)}))},reset:function(){var n=encodeURI(window.location.hash)||"#";try{var e=$(".wy-menu-vertical"),t=e.find('[href="'+n+'"]');if(0===t.length){var i=$('.document [id="'+n.substring(1)+'"]').closest("div.section");0===(t=e.find('[href="#'+i.attr("id")+'"]')).length&&(t=e.find('[href="#"]'))}if(t.length>0){$(".wy-menu-vertical .current").removeClass("current").attr("aria-expanded","false"),t.addClass("current").attr("aria-expanded","true"),t.closest("li.toctree-l1").parent().addClass("current").attr("aria-expanded","true");for(let n=1;n<=10;n++)t.closest("li.toctree-l"+n).addClass("current").attr("aria-expanded","true");t[0].scrollIntoView()}}catch(n){console.log("Error expanding nav for anchor",n)}},onScroll:function(){this.winScroll=!1;var n=this.win.scrollTop(),e=n+this.winHeight,t=this.navBar.scrollTop()+(n-this.winPosition);n<0||e>this.docHeight||(this.navBar.scrollTop(t),this.winPosition=n)},onResize:function(){this.winResize=!1,this.winHeight=this.win.height(),this.docHeight=$(document).height()},hashChange:function(){this.linkScroll=!0,this.win.one("hashchange",(function(){this.linkScroll=!1}))},toggleCurrent:function(n){var e=n.closest("li");e.siblings("li.current").removeClass("current").attr("aria-expanded","false"),e.siblings().find("li.current").removeClass("current").attr("aria-expanded","false");var t=e.find("> ul li");t.length&&(t.removeClass("current").attr("aria-expanded","false"),e.toggleClass("current").attr("aria-expanded",(function(n,e){return"true"==e?"false":"true"})))}},"undefined"!=typeof window&&(window.SphinxRtdTheme={Navigation:n.exports.ThemeNav,StickyNav:n.exports.ThemeNav}),function(){for(var n=0,e=["ms","moz","webkit","o"],t=0;t { + if (node.nodeType === Node.TEXT_NODE) { + const val = node.nodeValue; + const parent = node.parentNode; + const pos = val.toLowerCase().indexOf(text); + if ( + pos >= 0 && + !parent.classList.contains(className) && + !parent.classList.contains("nohighlight") + ) { + let span; + + const closestNode = parent.closest("body, svg, foreignObject"); + const isInSVG = closestNode && closestNode.matches("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.classList.add(className); + } + + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + parent.insertBefore( + span, + parent.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling + ) + ); + node.nodeValue = val.substr(0, pos); + + if (isInSVG) { + const rect = document.createElementNS( + "http://www.w3.org/2000/svg", + "rect" + ); + const bbox = parent.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute("class", className); + addItems.push({ parent: parent, target: rect }); + } + } + } else if (node.matches && !node.matches("button, select, textarea")) { + node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); + } +}; +const _highlightText = (thisNode, text, className) => { + let addItems = []; + _highlight(thisNode, addItems, text, className); + addItems.forEach((obj) => + obj.parent.insertAdjacentElement("beforebegin", obj.target) + ); +}; + +/** + * Small JavaScript module for the 
documentation. + */ +const SphinxHighlight = { + + /** + * highlight the search words provided in localstorage in the text + */ + highlightSearchWords: () => { + if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight + + // get and clear terms from localstorage + const url = new URL(window.location); + const highlight = + localStorage.getItem("sphinx_highlight_terms") + || url.searchParams.get("highlight") + || ""; + localStorage.removeItem("sphinx_highlight_terms") + url.searchParams.delete("highlight"); + window.history.replaceState({}, "", url); + + // get individual terms from highlight string + const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); + if (terms.length === 0) return; // nothing to do + + // There should never be more than one element matching "div.body" + const divBody = document.querySelectorAll("div.body"); + const body = divBody.length ? divBody[0] : document.querySelector("body"); + window.setTimeout(() => { + terms.forEach((term) => _highlightText(body, term, "highlighted")); + }, 10); + + const searchBox = document.getElementById("searchbox"); + if (searchBox === null) return; + searchBox.appendChild( + document + .createRange() + .createContextualFragment( + '" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(SphinxHighlight.highlightSearchWords); +_ready(SphinxHighlight.initEscapeListener); diff --git a/docs/build/html/_static/sphinxdoc.css b/docs/build/html/_static/sphinxdoc.css new file mode 100644 index 0000000..b03830b --- /dev/null +++ b/docs/build/html/_static/sphinxdoc.css @@ -0,0 +1,354 @@ +/* + * sphinxdoc.css_t + * ~~~~~~~~~~~~~~~ + * + * Sphinx stylesheet -- sphinxdoc theme. Originally created by + * Armin Ronacher for Werkzeug. + * + * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +@import url("basic.css"); + +/* -- page layout ----------------------------------------------------------- */ + +body { + font-family: 'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', + 'Verdana', sans-serif; + font-size: 14px; + letter-spacing: -0.01em; + line-height: 150%; + text-align: center; + background-color: #BFD1D4; + color: black; + padding: 0; + border: 1px solid #aaa; + + margin: 0px 80px 0px 80px; + min-width: 740px; +} + +div.document { + background-color: white; + text-align: left; + background-image: url(contents.png); + background-repeat: repeat-x; +} + +div.documentwrapper { + float: left; + width: 100%; +} + +div.bodywrapper { + margin: 0 calc(230px + 10px) 0 0; + border-right: 1px solid #ccc; +} + +div.body { + margin: 0; + padding: 0.5em 20px 20px 20px; +} + +div.related { + font-size: 1em; +} + +div.related ul { + background-image: url(navigation.png); + height: 2em; + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; +} + +div.related ul li { + margin: 0; + padding: 0; + height: 2em; + float: left; +} + +div.related ul li.right { + float: right; + margin-right: 5px; +} + +div.related ul li a { + margin: 0; + padding: 0 5px 0 5px; + line-height: 1.75em; + color: #EE9816; +} + +div.related ul li a:hover { + color: #3CA8E7; +} + +div.sphinxsidebarwrapper { + padding: 0; +} + +div.sphinxsidebar { + padding: 0.5em 15px 15px 0; + width: calc(230px - 20px); + float: right; + font-size: 1em; + text-align: left; +} + +div.sphinxsidebar h3, div.sphinxsidebar h4 { + margin: 1em 0 0.5em 0; + font-size: 1em; + padding: 0.1em 0 0.1em 0.5em; + color: white; + border: 1px solid #86989B; + background-color: #AFC1C4; +} + +div.sphinxsidebar h3 a { + color: white; +} + +div.sphinxsidebar ul { + padding-left: 1.5em; + margin-top: 7px; + padding: 0; + line-height: 130%; +} + +div.sphinxsidebar ul ul { + margin-left: 20px; +} + +div.footer { + background-color: #E3EFF1; + color: #86989B; + padding: 3px 8px 3px 0; + clear: both; + font-size: 0.8em; + text-align: right; +} + +div.footer a { + color: #86989B; + text-decoration: underline; +} + +/* -- body styles ----------------------------------------------------------- */ + +p { + margin: 0.8em 0 0.5em 0; +} + +a { + color: #CA7900; + text-decoration: none; +} + +a:hover { + color: #2491CF; +} + +a:visited { + color: #551A8B; +} + +div.body a { + text-decoration: underline; +} + +h1 { + margin: 0; + padding: 0.7em 0 0.3em 0; + font-size: 1.5em; + color: #11557C; +} + +h2 { + margin: 1.3em 0 0.2em 0; + font-size: 1.35em; + padding: 0; +} + +h3 { + margin: 1em 0 -0.3em 0; + font-size: 1.2em; +} + +div.body h1 a, div.body h2 a, div.body h3 a, div.body h4 a, div.body h5 a, div.body h6 a { + color: black!important; +} + +h1 a.anchor, h2 a.anchor, h3 a.anchor, h4 a.anchor, h5 a.anchor, h6 a.anchor { + display: none; + margin: 0 0 0 0.3em; + padding: 0 0.2em 0 0.2em; + color: #aaa!important; +} + +h1:hover a.anchor, h2:hover a.anchor, h3:hover a.anchor, h4:hover a.anchor, +h5:hover a.anchor, h6:hover a.anchor { + display: inline; +} + +h1 a.anchor:hover, h2 a.anchor:hover, h3 a.anchor:hover, h4 a.anchor:hover, +h5 a.anchor:hover, h6 a.anchor:hover { + color: #777; + background-color: #eee; +} + +a.headerlink { + color: #c60f0f!important; + font-size: 1em; + margin-left: 6px; + padding: 0 4px 0 4px; + text-decoration: none!important; +} + +a.headerlink:hover { + background-color: #ccc; + color: white!important; +} + +cite, code, code { + font-family: 'Consolas', 'Deja Vu Sans Mono', + 'Bitstream Vera Sans Mono', monospace; 
+ font-size: 0.95em; + letter-spacing: 0.01em; +} + +code { + background-color: #f2f2f2; + border-bottom: 1px solid #ddd; + color: #333; +} + +code.descname, code.descclassname, code.xref { + border: 0; +} + +hr { + border: 1px solid #abc; + margin: 2em; +} + +a code { + border: 0; + color: #CA7900; +} + +a code:hover { + color: #2491CF; +} + +pre { + font-family: 'Consolas', 'Deja Vu Sans Mono', + 'Bitstream Vera Sans Mono', monospace; + font-size: 0.95em; + letter-spacing: 0.015em; + line-height: 120%; + padding: 0.5em; + border: 1px solid #ccc; +} + +pre a { + color: inherit; + text-decoration: underline; +} + +td.linenos pre { + padding: 0.5em 0; +} + +div.quotebar { + background-color: #f8f8f8; + max-width: 250px; + float: right; + padding: 2px 7px; + border: 1px solid #ccc; +} + +nav.contents, +aside.topic, +div.topic { + background-color: #f8f8f8; +} + +table { + border-collapse: collapse; + margin: 0 -0.5em 0 -0.5em; +} + +table td, table th { + padding: 0.2em 0.5em 0.2em 0.5em; +} + +div.admonition, div.warning { + font-size: 0.9em; + margin: 1em 0 1em 0; + border: 1px solid #86989B; + background-color: #f7f7f7; + padding: 0; +} + +div.admonition p, div.warning p { + margin: 0.5em 1em 0.5em 1em; + padding: 0; +} + +div.admonition pre, div.warning pre { + margin: 0.4em 1em 0.4em 1em; +} + +div.admonition p.admonition-title, +div.warning p.admonition-title { + margin: 0; + padding: 0.1em 0 0.1em 0.5em; + color: white; + border-bottom: 1px solid #86989B; + font-weight: bold; + background-color: #AFC1C4; +} + +div.warning { + border: 1px solid #940000; +} + +div.warning p.admonition-title { + background-color: #CF0000; + border-bottom-color: #940000; +} + +div.admonition ul, div.admonition ol, +div.warning ul, div.warning ol { + margin: 0.1em 0.5em 0.5em 3em; + padding: 0; +} + +div.versioninfo { + margin: 1em 0 0 0; + border: 1px solid #ccc; + background-color: #DDEAF0; + padding: 8px; + line-height: 1.3em; + font-size: 0.9em; +} + +.viewcode-back { + font-family: 'Lucida Grande', 'Lucida Sans Unicode', 'Geneva', + 'Verdana', sans-serif; +} + +div.viewcode-block:target { + background-color: #f4debf; + border-top: 1px solid #ac9; + border-bottom: 1px solid #ac9; +} + +div.code-block-caption { + background-color: #ddd; + color: #222; + border: 1px solid #ccc; +} \ No newline at end of file diff --git a/docs/build/html/api.html b/docs/build/html/api.html new file mode 100644 index 0000000..7c74701 --- /dev/null +++ b/docs/build/html/api.html @@ -0,0 +1,113 @@ + + + + + + + API — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + +
+ [Read the Docs theme page markup omitted]
+ API
+ quapy: QuaPy module for quantification
+ © Copyright 2024, Alejandro Moreo.
+ Built with Sphinx using a theme provided by Read the Docs.
+ + + + \ No newline at end of file diff --git a/docs/build/html/generated/quapy.html b/docs/build/html/generated/quapy.html new file mode 100644 index 0000000..61ce026 --- /dev/null +++ b/docs/build/html/generated/quapy.html @@ -0,0 +1,106 @@ + + + + + + + quapy — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + +
+ [Read the Docs theme page markup omitted]
+ quapy
+ QuaPy module for quantification
+ © Copyright 2024, Alejandro Moreo.
+ Built with Sphinx using a theme provided by Read the Docs.
+ + + + \ No newline at end of file diff --git a/docs/build/html/quapy.benchmarking.html b/docs/build/html/quapy.benchmarking.html new file mode 100644 index 0000000..ab3831f --- /dev/null +++ b/docs/build/html/quapy.benchmarking.html @@ -0,0 +1,119 @@ + + + + + + + quapy.benchmarking package — QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation + + + + + + + + + + + + + + + + + +
+ [Read the Docs theme page markup omitted]
+ quapy.benchmarking package
+ Submodules
+ quapy.benchmarking.typical module
+ quapy.benchmarking.typical.wrap_cls_params(params)
+ Module contents
+ © Copyright 2024, Alejandro Moreo.
+ Built with Sphinx using a theme provided by Read the Docs.
+ + + + \ No newline at end of file diff --git a/docs/leeme.txt b/docs/leeme.txt new file mode 100644 index 0000000..cabe457 --- /dev/null +++ b/docs/leeme.txt @@ -0,0 +1,10 @@ +Para meter los módulos dentro de doc hay que hacer un + +sphinx-apidoc -o docs/source/ quapy/ -P + +Eso importa todo lo que haya en quapy/ (incluidos los ficheros _ gracias a -P) en source y crea un rst para cada uno. + +Parece que lo del -P no funciona. Hay que meterlos a mano en quapy.method.rst + +Luego, simplemente +make html \ No newline at end of file diff --git a/docs/source/EUfooter.png b/docs/source/EUfooter.png new file mode 100644 index 0000000..0898c74 Binary files /dev/null and b/docs/source/EUfooter.png differ diff --git a/docs/source/index.md b/docs/source/index.md index ad9ac15..d52093e 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -24,19 +24,18 @@ The following script fetches a dataset of tweets, trains, applies, and evaluates ```python import quapy as qp -from sklearn.linear_model import LogisticRegression -dataset = qp.datasets.fetch_twitter('semeval16') +training, test = qp.datasets.fetch_UCIBinaryDataset("yeast").train_test # create an "Adjusted Classify & Count" quantifier -model = qp.method.aggregative.ACC(LogisticRegression()) -model.fit(dataset.training) +model = qp.method.aggregative.ACC() +Xtr, ytr = training.Xy +model.fit(Xtr, ytr) -estim_prevalence = model.predict(dataset.test.instances) -true_prevalence = dataset.test.prevalence() +estim_prevalence = model.predict(test.X) +true_prevalence = test.prevalence() error = qp.error.mae(true_prevalence, estim_prevalence) - print(f'Mean Absolute Error (MAE)={error:.3f}') ``` @@ -60,19 +59,19 @@ API ## Features -- Implementation of many popular quantification methods (Classify-&-Count and its variants, Expectation Maximization, quantification methods based on structured output learning, HDy, QuaNet, quantification ensembles, among others). -- Versatile functionality for performing evaluation based on sampling generation protocols (e.g., APP, NPP, etc.). -- Implementation of most commonly used evaluation metrics (e.g., AE, RAE, NAE, NRAE, SE, KLD, NKLD, etc.). -- Datasets frequently used in quantification (textual and numeric), including: - - 32 UCI Machine Learning binary datasets. - - 5 UCI Machine Learning multiclass datasets (new in v0.1.8!). - - 11 Twitter quantification-by-sentiment datasets. - - 3 product reviews quantification-by-sentiment datasets. - - 4 tasks from LeQua competition (new in v0.1.7!) - - IFCB dataset of plankton water samples (new in v0.1.8!). -- Native support for binary and single-label multiclass quantification scenarios. -- Model selection functionality that minimizes quantification-oriented loss functions. -- Visualization tools for analysing the experimental results. +* Implementation of many popular quantification methods (Classify-&-Count and its variants, Expectation Maximization, +quantification methods based on structured output learning, HDy, QuaNet, quantification ensembles, among others). +* Versatile functionality for performing evaluation based on sampling generation protocols (e.g., APP, NPP, etc.). +* Implementation of most commonly used evaluation metrics (e.g., AE, RAE, NAE, NRAE, SE, KLD, NKLD, etc.). +* Datasets frequently used in quantification (textual and numeric), including: + * 32 UCI Machine Learning datasets. + * 11 Twitter quantification-by-sentiment datasets. + * 3 product reviews quantification-by-sentiment datasets. 
+ * 4 tasks from LeQua 2022 competition and 4 tasks from LeQua 2024 competition + * IFCB for Plancton quantification +* Native support for binary and single-label multiclass quantification scenarios. +* Model selection functionality that minimizes quantification-oriented loss functions. +* Visualization tools for analysing the experimental results. ## Citing QuaPy @@ -98,3 +97,7 @@ In case you want to contribute improvements to quapy, please generate pull reque :width: 250px :alt: SoBigData++ ``` + +This work has been supported by the QuaDaSh project +_"Finanziato dall’Unione europea---Next Generation EU, +Missione 4 Componente 2 CUP B53D23026250001"_. diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..cc5b4dc --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,41 @@ +.. QuaPy: A Python-based open-source framework for quantification documentation master file, created by + sphinx-quickstart on Wed Feb 7 16:26:46 2024. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to QuaPy's documentation! +========================================================================================== + +QuaPy is a Python-based open-source framework for quantification. + +This document contains the API of the modules included in QuaPy. + +Installation +------------ + +`pip install quapy` + +GitHub +------------ + +QuaPy is hosted in GitHub at `https://github.com/HLT-ISTI/QuaPy `_ + + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + +Contents +-------- + +.. toctree:: + + modules + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/source/manuals/datasets.md b/docs/source/manuals/datasets.md index f818aa3..b7d8827 100644 --- a/docs/source/manuals/datasets.md +++ b/docs/source/manuals/datasets.md @@ -340,10 +340,10 @@ and a set of test samples (for evaluation). QuaPy returns this data as a Labelle (training) and two generation protocols (for validation and test samples), as follows: ```python -training, val_generator, test_generator = fetch_lequa2022(task=task) +training, val_generator, test_generator = qp.datasets.fetch_lequa2022(task=task) ``` -See the `lequa2022_experiments.py` in the examples folder for further details on how to +See the `5a.lequa2022_experiments.py` in the examples folder for further details on how to carry out experiments using these datasets. The datasets are downloaded only once, and stored for fast reuse. @@ -365,6 +365,53 @@ Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022). A Detailed Overview of LeQua@ CLEF 2022: Learning to Quantify. ``` +## LeQua 2024 Datasets + +QuaPy also provides the datasets used for the [LeQua 2024 competition](https://lequa2024.github.io/). +In brief, there are 4 tasks: +* T1: binary quantification (by sentiment) +* T2: multiclass quantification (28 classes, merchandise products) +* T3: ordinal quantification (5-stars sentiment ratings) +* T4: binary sentiment quantification under a combination of covariate shift and prior shift + +In all cases, the covariate space has 256 dimensions (extracted using the `ELECTRA-Small` model). + +Every task consists of a training set, a set of validation samples (for model selection) +and a set of test samples (for evaluation). QuaPy returns this data as a LabelledCollection +(training bags) and sampling generation protocols (for validation and test bags). 
+T3 also offers the possibility to obtain a series of training bags (in form of a +sampling generation protocol) instead of one single training bag. Use it as follows: + +```python +training, val_generator, test_generator = qp.datasets.fetch_lequa2024(task=task) +``` + +See the `5b.lequa2024_experiments.py` in the examples folder for further details on how to +carry out experiments using these datasets. + +The datasets are downloaded only once, and stored for fast reuse. + +Some statistics are summarized below: + +| Dataset | classes | train size | validation samples | test samples | docs by sample | type | +|---------|:-------:|:-----------:|:------------------:|:------------:|:--------------:|:--------:| +| T1 | 2 | 5000 | 1000 | 5000 | 250 | vector | +| T2 | 28 | 20000 | 1000 | 5000 | 1000 | vector | +| T3 | 5 | 100 samples | 1000 | 5000 | 200 | vector | +| T4 | 2 | 5000 | 1000 | 5000 | 250 | vector | + +For further details on the datasets or the competition, we refer to +[the official site](https://lequa2024.github.io/data/) and +[the overview paper](http://nmis.isti.cnr.it/sebastiani/Publications/LQ2024.pdf). + +``` +Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022). +An Overview of LeQua 2024, the 2nd International Data Challenge on Learning to Quantify, +Proceedings of the 4th International Workshop on Learning to Quantify (LQ 2024), +ECML-PKDD 2024, Vilnius, Lithuania. +``` + + ## IFCB Plankton dataset IFCB is a dataset of plankton species in water samples hosted in `Zenodo `_. @@ -402,12 +449,20 @@ train, test_gen = qp.datasets.fetch_IFCB(for_model_selection=False, single_sampl # ... train and evaluation ``` +See also [Automatic plankton quantification using deep features +P González, A Castaño, EE Peacock, J Díez, JJ Del Coz, HM Sosik +Journal of Plankton Research 41 (4), 449-463](https://par.nsf.gov/servlets/purl/10172325). + ## Adding Custom Datasets +It is straightforward to import your own datasets into QuaPy. +I what follows, there are some code snippets for doing so; see also the example +[3.custom_collection.py](https://github.com/HLT-ISTI/QuaPy/blob/master/examples/3.custom_collection.py). + QuaPy provides data loaders for simple formats dealing with -text, following the format: +text; for example, use `qp.data.reader.from_text` for the following the format: ``` class-id \t first document's pre-processed text \n @@ -415,13 +470,16 @@ class-id \t second document's pre-processed text \n ... ``` -and sparse representations of the form: +or `qp.data.reader.from_sparse` for sparse representations of the form: ``` {-1, 0, or +1} col(int):val(float) col(int):val(float) ... \n ... ``` +both functions return a tuple `X, y` containing a list of strings and the corresponding +labels, respectively. + The code in charge in loading a LabelledCollection is: ```python @@ -430,12 +488,13 @@ def load(cls, path:str, loader_func:callable): return LabelledCollection(*loader_func(path)) ``` -indicating that any _loader_func_ (e.g., a user-defined one) which +indicating that any `loader_func` (e.g., `from_text`, `from_sparse`, `from_csv`, or a user-defined one) which returns valid arguments for initializing a _LabelledCollection_ object will allow -to load any collection. In particular, the _LabelledCollection_ receives as -arguments the instances (as an iterable) and the labels (as an iterable) and, -additionally, the number of classes can be specified (it would otherwise be -inferred from the labels, but that requires at least one positive example for +to load any collection. 
More specifically, the _LabelledCollection_ receives as +arguments the _instances_ (iterable) and the _labels_ (iterable) and, +optionally, the number of classes (it would be +inferred from the labels if not indicated, but this requires at least one +positive example for all classes to be present in the collection). The same _loader_func_ can be passed to a Dataset, along with two @@ -448,20 +507,23 @@ import quapy as qp train_path = '../my_data/train.dat' test_path = '../my_data/test.dat' -def my_custom_loader(path): +def my_custom_loader(path, **custom_kwargs): with open(path, 'rb') as fin: ... return instances, labels -data = qp.data.Dataset.load(train_path, test_path, my_custom_loader) +data = qp.data.Dataset.load(train_path, test_path, my_custom_loader, **custom_kwargs) ``` ### Data Processing -QuaPy implements a number of preprocessing functions in the package _qp.data.preprocessing_, including: +QuaPy implements a number of preprocessing functions in the package `qp.data.preprocessing`, including: * _text2tfidf_: tfidf vectorization * _reduce_columns_: reducing the number of columns based on term frequency * _standardize_: transforms the column values into z-scores (i.e., subtract the mean and normalizes by the standard deviation, so that the column values have zero mean and unit variance). * _index_: transforms textual tokens into lists of numeric ids + +These functions are applied to `Dataset` objects, and offer the possibility to apply the transformation +inline (thus modifying the original dataset), or to return a modified copy. \ No newline at end of file diff --git a/docs/source/manuals/evaluation.md b/docs/source/manuals/evaluation.md index e5404a3..aba7068 100644 --- a/docs/source/manuals/evaluation.md +++ b/docs/source/manuals/evaluation.md @@ -46,18 +46,18 @@ e.g.: ```python qp.environ['SAMPLE_SIZE'] = 100 # once for all -true_prev = np.asarray([0.5, 0.3, 0.2]) # let's assume 3 classes -estim_prev = np.asarray([0.1, 0.3, 0.6]) +true_prev = [0.5, 0.3, 0.2] # let's assume 3 classes +estim_prev = [0.1, 0.3, 0.6] error = qp.error.mrae(true_prev, estim_prev) print(f'mrae({true_prev}, {estim_prev}) = {error:.3f}') ``` will print: ``` -mrae([0.500, 0.300, 0.200], [0.100, 0.300, 0.600]) = 0.914 +mrae([0.5, 0.3, 0.2], [0.1, 0.3, 0.6]) = 0.914 ``` -Finally, it is possible to instantiate QuaPy's quantification +It is also possible to instantiate QuaPy's quantification error functions from strings using, e.g.: ```python @@ -85,7 +85,7 @@ print(f'MAE = {mae:.4f}') ``` It is often desirable to evaluate our system using more than one -single evaluatio measure. In this case, it is convenient to generate +single evaluation measure. In this case, it is convenient to generate a _report_. A report in QuaPy is a dataframe accounting for all the true prevalence values with their corresponding prevalence values as estimated by the quantifier, along with the error each has given @@ -104,7 +104,7 @@ report['estim-prev'] = report['estim-prev'].map(F.strprev) print(report) print('Averaged values:') -print(report.mean()) +print(report.mean(numeric_only=True)) ``` This will produce an output like: @@ -141,11 +141,14 @@ true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, protocol=prot) All the evaluation functions implement specific optimizations for speeding-up the evaluation of aggregative quantifiers (i.e., of instances of _AggregativeQuantifier_). 
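From the user's perspective the speed-up is transparent; the following sketch shows a standard evaluation call in which it kicks in automatically (the heuristic itself is described next, and can be deactivated through the `aggr_speedup` argument mentioned below):

```python
import quapy as qp
from quapy.protocol import APP
from sklearn.linear_model import LogisticRegression

qp.environ['SAMPLE_SIZE'] = 100

train, test = qp.datasets.fetch_UCIBinaryDataset('haberman').train_test

# PACC is aggregative, so the test instances are classified only once
quantifier = qp.method.aggregative.PACC(LogisticRegression())
quantifier.fit(*train.Xy)

mae = qp.evaluation.evaluate(quantifier, protocol=APP(test), error_metric='mae')
# set aggr_speedup=False in the call above to deactivate the heuristic
```
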
+ The optimization comes down to generating classification predictions (either crisp or soft) only once for the entire test set, and then applying the sampling procedure to the predictions, instead of generating samples of instances and then computing the classification predictions every time. This is only possible when the protocol -is an instance of _OnLabelledCollectionProtocol_. The optimization is only +is an instance of _OnLabelledCollectionProtocol_. + +The optimization is only carried out when the number of classification predictions thus generated would be smaller than the number of predictions required for the entire protocol; e.g., if the original dataset contains 1M instances, but the protocol is such that it would @@ -156,4 +159,4 @@ precompute all the predictions irrespectively of the number of instances and num Finally, this can be deactivated by setting _aggr_speedup=False_. Note that this optimization is not only applied for the final evaluation, but also for the internal evaluations carried out during _model selection_. Since these are typically many, the heuristic can help reduce the -execution time a lot. \ No newline at end of file +execution time significatively. \ No newline at end of file diff --git a/docs/source/manuals/methods.md b/docs/source/manuals/methods.md index 3afa232..93aa1dd 100644 --- a/docs/source/manuals/methods.md +++ b/docs/source/manuals/methods.md @@ -1,7 +1,7 @@ # Quantification Methods Quantification methods can be categorized as belonging to -`aggregative` and `non-aggregative` groups. +`aggregative`, `non-aggregative`, and `meta-learning` groups. Most methods included in QuaPy at the moment are of type `aggregative` (though we plan to add many more methods in the near future), i.e., are methods characterized by the fact that @@ -12,21 +12,17 @@ Any quantifier in QuaPy shoud extend the class `BaseQuantifier`, and implement some abstract methods: ```python @abstractmethod - def fit(self, data: LabelledCollection): ... + def fit(self, X, y): ... @abstractmethod - def quantify(self, instances): ... + def predict(self, X): ... ``` The meaning of those functions should be familiar to those used to work with scikit-learn since the class structure of QuaPy is directly inspired by scikit-learn's _Estimators_. Functions -`fit` and `quantify` are used to train the model and to provide -class estimations (the reason why -scikit-learn' structure has not been adopted _as is_ in QuaPy responds to -the fact that scikit-learn's `predict` function is expected to return -one output for each input element --e.g., a predicted label for each -instance in a sample-- while in quantification the output for a sample -is one single array of class prevalences). +`fit` and `predict` (for which there is an alias `quantify`) +are used to train the model and to provide +class estimations. Quantifiers also extend from scikit-learn's `BaseEstimator`, in order to simplify the use of `set_params` and `get_params` used in [model selection](./model-selection). @@ -40,21 +36,26 @@ The methods that any `aggregative` quantifier must implement are: ```python @abstractmethod - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions, labels): @abstractmethod - def aggregate(self, classif_predictions:np.ndarray): ... + def aggregate(self, classif_predictions): ... ``` -These two functions replace the `fit` and `quantify` methods, since those -come with default implementations. 
The `fit` function is provided and amounts to: +The argument `classif_predictions` is whatever the method `classify` returns. +QuaPy comes with default implementations that cover most common cases, but you can +override `classify` in case your method requires further or different information to work. + +These two functions replace the `fit` and `predict` methods, which +come with default implementations. For instance, the `fit` function is +provided and amounts to: ```python -def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None): - self._check_init_parameters() - classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on=val_split) - self.aggregation_fit(classif_predictions, data) - return self + def fit(self, X, y): + self._check_init_parameters() + classif_predictions, labels = self.classifier_fit_predict(X, y) + self.aggregation_fit(classif_predictions, labels) + return self ``` Note that this function fits the classifier, and generates the predictions. This is assumed @@ -72,11 +73,11 @@ overriden (if needed) and allows the method to quickly raise any exception based found in the `__init__` arguments, thus avoiding to break after training the classifier and generating predictions. -Similarly, the function `quantify` is provided, and amounts to: +Similarly, the function `predict` (alias `quantify`) is provided, and amounts to: ```python -def quantify(self, instances): - classif_predictions = self.classify(instances) +def predict(self, X): + classif_predictions = self.classify(X) return self.aggregate(classif_predictions) ``` @@ -84,12 +85,14 @@ in which only the function `aggregate` is required to be overriden in most cases Aggregative quantifiers are expected to maintain a classifier (which is accessed through the `@property` `classifier`). This classifier is -given as input to the quantifier, and can be already fit -on external data (in which case, the `fit_learner` argument should -be set to False), or be fit by the quantifier's fit (default). +given as input to the quantifier, and will be trained by the quantifier's fit (default). +Alternatively, the classifier can be already fit on external data; in this case, the `fit_learner` +argument in the `__init__` should be set to False (see [4.using_pretrained_classifier.py](https://github.com/HLT-ISTI/QuaPy/blob/master/examples/4.using_pretrained_classifier.py) +for a full code example). -The above patterns (in training: fit the classifier, then fit the aggregation; -in test: classify, then aggregate) allows QuaPy to optimize many internal procedures. +The above patterns (in training: (i) fit the classifier, then (ii) fit the aggregation; +in test: (i) classify, then (ii) aggregate) allows QuaPy to optimize many internal procedures, +on the grounds that steps (i) are slower than steps (ii). 
In particular, the model selection routing takes advantage of this two-step process and generates classifiers only for the valid combinations of hyperparameters of the classifier, and then _clones_ these classifiers and explores the combinations @@ -124,6 +127,7 @@ import quapy.functional as F from sklearn.svm import LinearSVC training, test = qp.datasets.fetch_twitter('hcr', pickle=True).train_test +Xtr, ytr = training.Xy # instantiate a classifier learner, in this case a SVM svm = LinearSVC() @@ -131,7 +135,7 @@ svm = LinearSVC() # instantiate a Classify & Count with the SVM # (an alias is available in qp.method.aggregative.ClassifyAndCount) model = qp.method.aggregative.CC(svm) -model.fit(training) +model.fit(Xtr, ytr) estim_prevalence = model.predict(test.instances) ``` @@ -153,26 +157,14 @@ predictions. This parameters can also be set with an integer, indicating that the parameters should be estimated by means of _k_-fold cross-validation, for which the integer indicates the number _k_ of folds (the default value is 5). Finally, `val_split` can be set to a -specific held-out validation set (i.e., an instance of `LabelledCollection`). - -The specification of `val_split` can be -postponed to the invokation of the fit method (if `val_split` was also -set in the constructor, the one specified at fit time would prevail), -e.g.: - -```python -model = qp.method.aggregative.ACC(svm) -# perform 5-fold cross validation for estimating ACC's parameters -# (overrides the default val_split=0.4 in the constructor) -model.fit(training, val_split=5) -``` +specific held-out validation set (i.e., an tuple `(X,y)`). The following code illustrates the case in which PCC is used: ```python model = qp.method.aggregative.PCC(svm) -model.fit(training) -estim_prevalence = model.predict(test.instances) +model.fit(Xtr, ytr) +estim_prevalence = model.predict(Xte) print('classifier:', model.classifier) ``` In this case, QuaPy will print: @@ -185,11 +177,11 @@ is not a probabilistic classifier (i.e., it does not implement the `predict_proba` method) and so, the classifier will be converted to a probabilistic one through [calibration](https://scikit-learn.org/stable/modules/calibration.html). As a result, the classifier that is printed in the second line points -to a `CalibratedClassifier` instance. Note that calibration can only -be applied to hard classifiers when `fit_learner=True`; an exception +to a `CalibratedClassifierCV` instance. Note that calibration can only +be applied to hard classifiers if `fit_learner=True`; an exception will be raised otherwise. -Lastly, everything we said aboud ACC and PCC +Lastly, everything we said about ACC and PCC applies to PACC as well. _New in v0.1.9_: quantifiers ACC and PACC now have three additional arguments: `method`, `solver` and `norm`: @@ -221,7 +213,7 @@ Options are: * `"condsoftmax"` applies softmax normalization only if the prevalence vector lies outside of the probability simplex. -#### BayesianCC (_New in v0.1.9_!) +#### BayesianCC The `BayesianCC` is a variant of ACC introduced in [Ziegler, A. and Czyż, P. 
"Bayesian quantification with black-box estimators", arXiv (2023)](https://arxiv.org/abs/2302.09159), @@ -259,29 +251,35 @@ An example of use can be found below: import quapy as qp from sklearn.linear_model import LogisticRegression -dataset = qp.datasets.fetch_twitter('hcr', pickle=True) +train, test = qp.datasets.fetch_twitter('hcr', pickle=True).train_test model = qp.method.aggregative.EMQ(LogisticRegression()) -model.fit(dataset.training) -estim_prevalence = model.predict(dataset.test.instances) +model.fit(*train.Xy) +estim_prevalence = model.predict(test.X) ``` -_New in v0.1.7_: EMQ now accepts two new parameters in the construction method, namely -`exact_train_prev` which allows to use the true training prevalence as the departing -prevalence estimation (default behaviour), or instead an approximation of it as +EMQ accepts additional parameters in the construction method: +* `exact_train_prev`: set to True for using the true training prevalence as the departing +prevalence estimation (default behaviour), or to False for using an approximation of it as suggested by [Alexandari et al. (2020)](http://proceedings.mlr.press/v119/alexandari20a.html) -(by setting `exact_train_prev=False`). -The other parameter is `recalib` which allows to indicate a calibration method, among those +* `calib`: allows to indicate a calibration method, among those proposed by [Alexandari et al. (2020)](http://proceedings.mlr.press/v119/alexandari20a.html), -including the Bias-Corrected Temperature Scaling, Vector Scaling, etc. -See the API documentation for further details. +including the Bias-Corrected Temperature Scaling +(`bcts`), Vector Scaling (`bcts`), No-Bias Temperature Scaling (`nbvs`), +or Temperature Scaling (`ts`); default is `None` (no calibration). +* `on_calib_error`: indicates the policy to follow in case the calibrator fails at runtime. + Options include `raise` (default), in which case a RuntimeException is raised; and `backup`, in which + case the calibrator is silently skipped. + +You can use the class method `EMQ_BCTS` to effortlessly instantiate EMQ with the best performing +heuristics found by [Alexandari et al. (2020)](http://proceedings.mlr.press/v119/alexandari20a.html). See the API documentation for further details. ### Hellinger Distance y (HDy) Implementation of the method based on the Hellinger Distance y (HDy) proposed by -[González-Castro, V., Alaiz-Rodrı́guez, R., and Alegre, E. (2013). Class distribution -estimation based on the Hellinger distance. Information Sciences, 218:146–164.](https://www.sciencedirect.com/science/article/pii/S0020025512004069) +[González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution +estimation based on the Hellinger distance. Information Sciences, 218:146-164.](https://www.sciencedirect.com/science/article/pii/S0020025512004069) It is implemented in `qp.method.aggregative.HDy` (also accessible through the allias `qp.method.aggregative.HellingerDistanceY`). @@ -289,11 +287,10 @@ This method works with a probabilistic classifier (hard classifiers can be used as well and will be calibrated) and requires a validation set to estimate parameter for the mixture model. 
Just like ACC and PACC, this quantifier receives a `val_split` argument -in the constructor (or in the fit method, in which case the previous -value is overridden) that can either be a float indicating the proportion +in the constructor that can either be a float indicating the proportion of training data to be taken as the validation set (in a random -stratified split), or a validation set (i.e., an instance of -`LabelledCollection`) itself. +stratified split), or the validation set itself (i.e., an tuple +`(X,y)`). HDy was proposed as a binary classifier and the implementation provided in QuaPy accepts only binary datasets. @@ -309,11 +306,11 @@ dataset = qp.datasets.fetch_reviews('hp', pickle=True) qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True) model = qp.method.aggregative.HDy(LogisticRegression()) -model.fit(dataset.training) -estim_prevalence = model.predict(dataset.test.instances) +model.fit(*dataset.training.Xy) +estim_prevalence = model.predict(dataset.test.X) ``` -_New in v0.1.7:_ QuaPy now provides an implementation of the generalized +QuaPy also provides an implementation of the generalized "Distribution Matching" approaches for multiclass, inspired by the framework of [Firat (2016)](https://arxiv.org/abs/1606.00868). One can instantiate a variant of HDy for multiclass quantification as follows: @@ -322,17 +319,22 @@ a variant of HDy for multiclass quantification as follows: mutliclassHDy = qp.method.aggregative.DMy(classifier=LogisticRegression(), divergence='HD', cdf=False) ``` -_New in v0.1.7:_ QuaPy now provides an implementation of the "DyS" +QuaPy also provides an implementation of the "DyS" framework proposed by [Maletzke et al (2020)](https://ojs.aaai.org/index.php/AAAI/article/view/4376) and the "SMM" method proposed by [Hassan et al (2019)](https://ieeexplore.ieee.org/document/9260028) (thanks to _Pablo González_ for the contributions!) ### Threshold Optimization methods -_New in v0.1.7:_ QuaPy now implements Forman's threshold optimization methods; +QuaPy implements Forman's threshold optimization methods; see, e.g., [(Forman 2006)](https://dl.acm.org/doi/abs/10.1145/1150402.1150423) and [(Forman 2008)](https://link.springer.com/article/10.1007/s10618-008-0097-y). -These include: T50, MAX, X, Median Sweep (MS), and its variant MS2. +These include: `T50`, `MAX`, `X`, Median Sweep (`MS`), and its variant `MS2`. + +These methods are binary-only and implement different heuristics for +improving the stability of the denominator of the ACC adjustment (`tpr-fpr`). +The methods are called "threshold" since said heuristics have to do +with different choices of the underlying classifier's threshold. ### Explicit Loss Minimization @@ -415,16 +417,18 @@ model.fit(dataset.training) estim_prevalence = model.predict(dataset.test.instances) ``` -Check the examples on [explicit_loss_minimization](https://github.com/HLT-ISTI/QuaPy/blob/devel/examples/5.explicit_loss_minimization.py) +Check the examples on [explicit loss minimization](https://github.com/HLT-ISTI/QuaPy/blob/devel/examples/17.explicit_loss_minimization.py) and on [one versus all quantification](https://github.com/HLT-ISTI/QuaPy/blob/devel/examples/10.one_vs_all.py) for more details. +**Note** that the _one versus all_ approach is considered inappropriate under prior probability shift, though. 
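As a quick illustration of the threshold-based methods described above, the following sketch (which assumes `MS2` is exposed in `qp.method.aggregative`, as the other aggregative methods are) follows the usual fit/predict pattern; `T50`, `MAX`, `X`, and `MS` are used analogously:

```python
import quapy as qp
from sklearn.linear_model import LogisticRegression

# threshold methods are binary-only, so we use a binary dataset
train, test = qp.datasets.fetch_UCIBinaryDataset('haberman').train_test

# MS2: a Median-Sweep variant that aggregates ACC-style corrections
# computed at many decision thresholds of the underlying classifier
model = qp.method.aggregative.MS2(LogisticRegression())
model.fit(*train.Xy)
estim_prevalence = model.predict(test.X)
```
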
### Kernel Density Estimation methods (KDEy) -_New in v0.1.8_: QuaPy now provides implementations for the three variants +QuaPy provides implementations for the three variants of KDE-based methods proposed in -_[Moreo, A., González, P. and del Coz, J.J., 2023. +_[Moreo, A., González, P. and del Coz, J.J.. Kernel Density Estimation for Multiclass Quantification. -arXiv preprint arXiv:2401.00490.](https://arxiv.org/abs/2401.00490)_. +Machine Learning. Vol 114 (92), 2025](https://link.springer.com/article/10.1007/s10994-024-06726-5)_ +(a [preprint](https://arxiv.org/abs/2401.00490) is available online). The variants differ in the divergence metric to be minimized: - KDEy-HD: minimizes the (squared) Hellinger Distance and solves the problem via a Monte Carlo approach @@ -435,30 +439,42 @@ These methods are specifically devised for multiclass problems (although they ca binary problems too). All KDE-based methods depend on the hyperparameter `bandwidth` of the kernel. Typical values -that can be explored in model selection range in [0.01, 0.25]. The methods' performance -vary smoothing with smooth variations of this hyperparameter. +that can be explored in model selection range in [0.01, 0.25]. Previous experiments reveal the methods' performance +varies smoothly at small variations of this hyperparameter. ## Composable Methods -The [](quapy.method.composable) module allows the composition of quantification methods from loss functions and feature transformations. Any composed method solves a linear system of equations by minimizing the loss after transforming the data. Methods of this kind include ACC, PACC, HDx, HDy, and many other well-known methods, as well as an unlimited number of re-combinations of their building blocks. +The `quapy.method.composable` module integrates [qunfold](https://github.com/mirkobunse/qunfold) allows the composition +of quantification methods from loss functions and feature transformations (thanks to Mirko Bunse for the integration!). + +Any composed method solves a linear system of equations by minimizing the loss after transforming the data. Methods of this kind include ACC, PACC, HDx, HDy, and many other well-known methods, as well as an unlimited number of re-combinations of their building blocks. ### Installation ```sh pip install --upgrade pip setuptools wheel pip install "jax[cpu]" -pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4" +pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.5" ``` +**Note:** since version 0.2.0, QuaPy is only compatible with qunfold >=0.1.5. + ### Basics The composition of a method is implemented through the [](quapy.method.composable.ComposableQuantifier) class. Its documentation also features an example to get you started in composing your own methods. 
```python +from quapy.method.composable import ( + ComposableQuantifier, + TikhonovRegularized, + LeastSquaresLoss, + ClassRepresentation, +) + ComposableQuantifier( # ordinal ACC, as proposed by Bunse et al., 2022 - TikhonovRegularized(LeastSquaresLoss(), 0.01), - ClassTransformer(RandomForestClassifier(oob_score=True)) + TikhonovRegularized(LeastSquaresLoss(), 0.01), + ClassRepresentation(RandomForestClassifier(oob_score=True)) ) ``` @@ -485,16 +501,16 @@ You can use the [](quapy.method.composable.CombinedLoss) to create arbitrary, we ### Feature transformations -- [](quapy.method.composable.ClassTransformer) -- [](quapy.method.composable.DistanceTransformer) -- [](quapy.method.composable.HistogramTransformer) -- [](quapy.method.composable.EnergyKernelTransformer) -- [](quapy.method.composable.GaussianKernelTransformer) -- [](quapy.method.composable.LaplacianKernelTransformer) -- [](quapy.method.composable.GaussianRFFKernelTransformer) +- [](quapy.method.composable.ClassRepresentation) +- [](quapy.method.composable.DistanceRepresentation) +- [](quapy.method.composable.HistogramRepresentation) +- [](quapy.method.composable.EnergyKernelRepresentation) +- [](quapy.method.composable.GaussianKernelRepresentation) +- [](quapy.method.composable.LaplacianKernelRepresentation) +- [](quapy.method.composable.GaussianRFFKernelRepresentation) ```{hint} -The [](quapy.method.composable.ClassTransformer) requires the classifier to have a property `oob_score==True` and to produce a property `oob_decision_function` during fitting. In [scikit-learn](https://scikit-learn.org/), this requirement is fulfilled by any bagging classifier, such as random forests. Any other classifier needs to be cross-validated through the [](quapy.method.composable.CVClassifier). +The [](quapy.method.composable.ClassRepresentation) requires the classifier to have a property `oob_score==True` and to produce a property `oob_decision_function` during fitting. In [scikit-learn](https://scikit-learn.org/), this requirement is fulfilled by any bagging classifier, such as random forests. Any other classifier needs to be cross-validated through the [](quapy.method.composable.CVClassifier). ``` @@ -529,10 +545,11 @@ from quapy.method.meta import Ensemble from sklearn.linear_model import LogisticRegression dataset = qp.datasets.fetch_UCIBinaryDataset('haberman') +train, test = dataset.train_test model = Ensemble(quantifier=ACC(LogisticRegression()), size=30, policy='ave', n_jobs=-1) -model.fit(dataset.training) -estim_prevalence = model.predict(dataset.test.instances) +model.fit(*train.Xy) +estim_prevalence = model.predict(test.X) ``` Other aggregation policies implemented in QuaPy include: @@ -579,7 +596,32 @@ learner = NeuralClassifierTrainer(cnn, device='cuda') # train QuaNet model = QuaNet(learner, device='cuda') -model.fit(dataset.training) -estim_prevalence = model.predict(dataset.test.instances) +model.fit(*dataset.training.Xy) +estim_prevalence = model.predict(dataset.test.X) ``` +## Confidence Regions for Class Prevalence Estimation + +_(New in v0.2.0!)_ Some quantification methods go beyond providing a single point estimate of class prevalence values and also produce confidence regions, which characterize the uncertainty around the point estimate. In QuaPy, two such methods are currently implemented: + +* Aggregative Bootstrap: The Aggregative Bootstrap method extends any aggregative quantifier by generating confidence regions for class prevalence estimates through bootstrapping. 
The method is described in the paper [Moreo, A., Salvati, N. + An Efficient Method for Deriving Confidence Intervals in Aggregative Quantification. + Learning to Quantify: Methods and Applications (LQ 2025), co-located at ECML-PKDD 2025. + pp 12-33, Porto (Portugal)](https://lq-2025.github.io/proceedings/CompleteVolume.pdf). Key features of this method include: + + * Optimized Computation: The bootstrap is applied to pre-classified instances, significantly speeding up training and inference. +During training, bootstrap repetitions are performed only after training the classifier once. These repetitions are used to train multiple aggregation functions. +During inference, bootstrap is applied over pre-classified test instances. + * General Applicability: Aggregative Bootstrap can be applied to any aggregative quantifier. + For further information, check the [example](https://github.com/HLT-ISTI/QuaPy/tree/master/examples/16.confidence_regions.py) provided. + +* BayesianCC: is a Bayesian variant of the Adjusted Classify & Count (ACC) quantifier; see more details in the [example](https://github.com/HLT-ISTI/QuaPy/tree/master/examples/14.bayesian_quantification.py) provided. + +Confidence regions are constructed around a point estimate, which is typically computed as the mean value of a set of samples. +The confidence region can be instantiated in three ways: +* Confidence intervals: are standard confidence intervals generated for each class independently (_method="intervals"_). +* Confidence ellipse in the simplex: an ellipse constructed around the mean point; the ellipse lies on the simplex and takes + into account possible inter-class dependencies in the data (_method="ellipse"_). +* Confidence ellipse in the Centered-Log Ratio (CLR) space: the underlying assumption of the ellipse is that the components are + normally distributed. However, we know elements from the simplex have an inner structure. A better approach is to first + transform the components into an unconstrained space (the CLR), and then construct the ellipse in such space (_method="ellipse-clr"_). \ No newline at end of file diff --git a/docs/source/manuals/model-selection.md b/docs/source/manuals/model-selection.md index 097f902..6470ebf 100644 --- a/docs/source/manuals/model-selection.md +++ b/docs/source/manuals/model-selection.md @@ -87,7 +87,7 @@ model = qp.model_selection.GridSearchQ( error='mae', # the error to optimize is the MAE (a quantification-oriented loss) refit=True, # retrain on the whole labelled set once done verbose=True # show information as the process goes on -).fit(training) +).fit(*training.Xy) print(f'model selection ended: best hyper-parameters={model.best_params_}') model = model.best_model_ @@ -133,7 +133,7 @@ learner = GridSearchCV( LogisticRegression(), param_grid={'C': np.logspace(-4, 5, 10), 'class_weight': ['balanced', None]}, cv=5) -model = DistributionMatching(learner).fit(dataset.train) +model = DistributionMatching(learner).fit(*dataset.train.Xy) ``` However, this is conceptually flawed, since the model should be diff --git a/docs/source/manuals/plotting.md b/docs/source/manuals/plotting.md index ec080da..67f9f16 100644 --- a/docs/source/manuals/plotting.md +++ b/docs/source/manuals/plotting.md @@ -2,6 +2,9 @@ The module _qp.plot_ implements some basic plotting functions that can help analyse the performance of a quantification method. +See the provided +[code example](https://github.com/HLT-ISTI/QuaPy/blob/master/examples/13.plotting.py) +for a full example. 
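+
+In a nutshell, the typical workflow is: fit one or more quantifiers, collect their true and estimated prevalence values with `qp.evaluation.prediction` under an evaluation protocol, and pass the results to one of the plotting functions. The following is a minimal sketch (it assumes the standard signature of `qp.plot.binary_diagonal`; the exact inputs expected by each function are detailed below):
+
+```python
+import quapy as qp
+from quapy.protocol import APP
+from quapy.method.aggregative import CC, PACC
+from sklearn.linear_model import LogisticRegression
+
+qp.environ['SAMPLE_SIZE'] = 100
+train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test
+
+method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
+for name, model in [('CC', CC(LogisticRegression())), ('PACC', PACC(LogisticRegression()))]:
+    model.fit(*train.Xy)
+    # generate test samples with APP and collect true and estimated prevalence values
+    true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0))
+    method_names.append(name)
+    true_prevs.append(true_prev)
+    estim_prevs.append(estim_prev)
+    tr_prevs.append(train.prevalence())
+
+# diagonal plot of estimated vs. true prevalence of the positive class
+qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, train_prev=train.prevalence(), savepath='diagonal.png')
+```
+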
All plotting functions receive as inputs the outcomes of some experiments and include, for each experiment, @@ -77,7 +80,7 @@ def gen_data(): method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], [] for method_name, model in models(): - model.fit(train) + model.fit(*train.Xy) true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0)) method_names.append(method_name) @@ -171,7 +174,7 @@ def gen_data(): training_size = 5000 # since the problem is binary, it suffices to specify the negative prevalence, since the positive is constrained train_sample = train.sampling(training_size, 1-training_prevalence) - model.fit(train_sample) + model.fit(*train_sample.Xy) true_prev, estim_prev = qp.evaluation.prediction(model, APP(test, repeats=100, random_state=0)) method_name = 'CC$_{'+f'{int(100*training_prevalence)}' + '\%}$' method_data.append((method_name, true_prev, estim_prev, train_sample.prevalence())) diff --git a/docs/source/manuals/protocols.md b/docs/source/manuals/protocols.md index 1d6193e..17bc41a 100644 --- a/docs/source/manuals/protocols.md +++ b/docs/source/manuals/protocols.md @@ -1,7 +1,5 @@ # Protocols -_New in v0.1.7!_ - Quantification methods are expected to behave robustly in the presence of shift. For this reason, quantification methods need to be confronted with samples exhibiting widely varying amounts of shift. @@ -106,15 +104,16 @@ train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test # model selection train, val = train.split_stratified(train_prop=0.75) +Xtr, ytr = train.Xy quantifier = qp.model_selection.GridSearchQ( quantifier, param_grid={'classifier__C': np.logspace(-2, 2, 5)}, protocol=APP(val) # <- this is the protocol we use for generating validation samples -).fit(train) +).fit(Xtr, ytr) # default values are n_prevalences=21, repeats=10, random_state=0; this is equialent to: # val_app = APP(val, n_prevalences=21, repeats=10, random_state=0) -# quantifier = GridSearchQ(quantifier, param_grid, protocol=val_app).fit(train) +# quantifier = GridSearchQ(quantifier, param_grid, protocol=val_app).fit(Xtr, ytr) # evaluation with APP mae = qp.evaluation.evaluate(quantifier, protocol=APP(test), error_metric='mae') diff --git a/docs/source/modules.rst b/docs/source/modules.rst new file mode 100644 index 0000000..5d84a54 --- /dev/null +++ b/docs/source/modules.rst @@ -0,0 +1,7 @@ +quapy +===== + +.. toctree:: + :maxdepth: 4 + + quapy diff --git a/docs/source/quapy.method.rst b/docs/source/quapy.method.rst index ac0dfc8..88fcc7d 100644 --- a/docs/source/quapy.method.rst +++ b/docs/source/quapy.method.rst @@ -60,6 +60,14 @@ quapy.method.composable module :undoc-members: :show-inheritance: +quapy.method.confidence module +------------------------------ + +.. automodule:: quapy.method.confidence + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/examples/10.one_vs_all.py b/examples/10.one_vs_all.py index 3f5c4ac..ca70662 100644 --- a/examples/10.one_vs_all.py +++ b/examples/10.one_vs_all.py @@ -9,6 +9,11 @@ import numpy as np """ In this example, we will create a quantifier for tweet sentiment analysis considering three classes: negative, neutral, and positive. We will use a one-vs-all approach using a binary quantifier for demonstration purposes. + +Caveat: the one-vs-all approach is deemed inadequate under prior probability shift conditions. The reasons +are discussed in: +Donyavi, Z., Serapio, A., & Batista, G. (2023). 
MC-SQ: A highly accurate ensemble for multi-class quantifi- +cation. In: Proceedings of the 2023 SIAM International Conference on Data Mining (SDM), SIAM, pp. 622–630 """ qp.environ['SAMPLE_SIZE'] = 100 @@ -40,11 +45,11 @@ param_grid = { } print('starting model selection') model_selection = GridSearchQ(quantifier, param_grid, protocol=UPP(val), verbose=True, refit=False) -quantifier = model_selection.fit(train_modsel).best_model() +quantifier = model_selection.fit(*train_modsel.Xy).best_model() print('training on the whole training set') train, test = qp.datasets.fetch_twitter('hcr', for_model_selection=False, pickle=True).train_test -quantifier.fit(train) +quantifier.fit(*train.Xy) # evaluation mae = qp.evaluation.evaluate(quantifier, protocol=UPP(test), error_metric='mae') diff --git a/examples/11.comparing_HDy_HDx.py b/examples/11.comparing_HDy_HDx.py index 7d96b6a..a95b780 100644 --- a/examples/11.comparing_HDy_HDx.py +++ b/examples/11.comparing_HDy_HDx.py @@ -23,8 +23,9 @@ qp.environ['SAMPLE_SIZE']=100 df = pd.DataFrame(columns=['method', 'dataset', 'MAE', 'MRAE', 'tr-time', 'te-time']) +datasets = qp.datasets.UCI_BINARY_DATASETS -for dataset_name in tqdm(qp.datasets.UCI_BINARY_DATASETS, total=len(qp.datasets.UCI_BINARY_DATASETS)): +for dataset_name in tqdm(datasets, total=len(datasets), desc='datasets processed'): if dataset_name in ['acute.a', 'acute.b', 'balance.2', 'iris.1']: # these datasets tend to produce either too good or too bad results... continue @@ -32,23 +33,25 @@ for dataset_name in tqdm(qp.datasets.UCI_BINARY_DATASETS, total=len(qp.datasets. collection = qp.datasets.fetch_UCIBinaryLabelledCollection(dataset_name, verbose=False) train, test = collection.split_stratified() + Xtr, ytr = train.Xy + # HDy............................................ tinit = time() - hdy = HDy(LogisticRegression()).fit(train) + hdy = HDy(LogisticRegression()).fit(Xtr, ytr) t_hdy_train = time()-tinit tinit = time() - hdy_report = qp.evaluation.evaluation_report(hdy, APP(test), error_metrics=['mae', 'mrae']).mean() + hdy_report = qp.evaluation.evaluation_report(hdy, APP(test), error_metrics=['mae', 'mrae']).mean(numeric_only=True) t_hdy_test = time() - tinit df.loc[len(df)] = ['HDy', dataset_name, hdy_report['mae'], hdy_report['mrae'], t_hdy_train, t_hdy_test] # HDx............................................ tinit = time() - hdx = DMx.HDx(n_jobs=-1).fit(train) + hdx = DMx.HDx(n_jobs=-1).fit(Xtr, ytr) t_hdx_train = time() - tinit tinit = time() - hdx_report = qp.evaluation.evaluation_report(hdx, APP(test), error_metrics=['mae', 'mrae']).mean() + hdx_report = qp.evaluation.evaluation_report(hdx, APP(test), error_metrics=['mae', 'mrae']).mean(numeric_only=True) t_hdx_test = time() - tinit df.loc[len(df)] = ['HDx', dataset_name, hdx_report['mae'], hdx_report['mrae'], t_hdx_train, t_hdx_test] diff --git a/examples/12.custom_protocol.py b/examples/12.custom_protocol.py index 7824b3f..774a0ed 100644 --- a/examples/12.custom_protocol.py +++ b/examples/12.custom_protocol.py @@ -3,14 +3,13 @@ from sklearn.linear_model import LogisticRegression import quapy as qp from quapy.method.aggregative import PACC -from quapy.data import LabelledCollection from quapy.protocol import AbstractStochasticSeededProtocol import quapy.functional as F """ In this example, we create a custom protocol. -The protocol generates samples of a Gaussian mixture model with random mixture parameter (the sample prevalence). -Datapoints are univariate and we consider 2 classes only. 
+The protocol generates synthetic samples of a Gaussian mixture model with random mixture parameter +(the sample prevalence). Datapoints are univariate and we consider 2 classes only for simplicity. """ class GaussianMixProtocol(AbstractStochasticSeededProtocol): # We need to extend AbstractStochasticSeededProtocol if we want the samples to be replicable @@ -81,10 +80,9 @@ with qp.util.temp_seed(0): Xpos = np.random.normal(loc=mu_2, scale=std_2, size=100) X = np.concatenate([Xneg, Xpos]).reshape(-1,1) y = [0]*100 + [1]*100 - training = LabelledCollection(X, y) pacc = PACC(LogisticRegression()) - pacc.fit(training) + pacc.fit(X, y) mae = qp.evaluation.evaluate(pacc, protocol=gm, error_metric='mae', verbose=True) diff --git a/examples/14.bayesian_quantification.py b/examples/14.bayesian_quantification.py index 667149b..21a1be1 100644 --- a/examples/14.bayesian_quantification.py +++ b/examples/14.bayesian_quantification.py @@ -122,7 +122,7 @@ def get_random_forest() -> RandomForestClassifier: def _get_estimate(estimator_class, training: LabelledCollection, test: np.ndarray) -> None: """Auxiliary method for running ACC and PACC.""" estimator = estimator_class(get_random_forest()) - estimator.fit(training) + estimator.fit(*training.Xy) return estimator.predict(test) @@ -130,7 +130,7 @@ def train_and_plot_bayesian_quantification(ax: plt.Axes, training: LabelledColle """Fits Bayesian quantification and plots posterior mean as well as individual samples""" print('training model Bayesian CC...', end='') quantifier = BayesianCC(classifier=get_random_forest()) - quantifier.fit(training) + quantifier.fit(*training.Xy) # Obtain mean prediction mean_prediction = quantifier.predict(test.X) diff --git a/examples/15.composable_methods.py b/examples/15.composable_methods.py index 5ffcb94..df3b34c 100644 --- a/examples/15.composable_methods.py +++ b/examples/15.composable_methods.py @@ -1,6 +1,6 @@ """ This example illustrates the composition of quantification methods from -arbitrary loss functions and feature transformations. It will extend the basic +arbitrary loss functions and feature representations. It will extend the basic example on the usage of quapy with this composition. This example requires the installation of qunfold, the back-end of QuaPy's @@ -8,7 +8,7 @@ composition module: pip install --upgrade pip setuptools wheel pip install "jax[cpu]" - pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4" + pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.5" """ import numpy as np @@ -22,22 +22,23 @@ data = qp.data.preprocessing.text2tfidf( min_df = 5, ) training, testing = data.train_test +Xtr, ytr = training.Xy # We start by recovering PACC from its building blocks, a LeastSquaresLoss and -# a probabilistic ClassTransformer. A 5-fold cross-validation is implemented +# a probabilistic ClassRepresentation. A 5-fold cross-validation is implemented # through a CVClassifier. from quapy.method.composable import ( ComposableQuantifier, LeastSquaresLoss, - ClassTransformer, + ClassRepresentation, CVClassifier, ) from sklearn.linear_model import LogisticRegression pacc = ComposableQuantifier( LeastSquaresLoss(), - ClassTransformer( + ClassRepresentation( CVClassifier(LogisticRegression(random_state=0), 5), is_probabilistic = True ), @@ -46,7 +47,7 @@ pacc = ComposableQuantifier( # Let's evaluate this quantifier. 
print(f"Evaluating PACC: {pacc}") -pacc.fit(training) +pacc.fit(Xtr, ytr) app = qp.protocol.APP(testing, sample_size=100, n_prevalences=21, repeats=1) absolute_errors = qp.evaluation.evaluate( model = pacc, @@ -63,14 +64,14 @@ from quapy.method.composable import HellingerSurrogateLoss model = ComposableQuantifier( HellingerSurrogateLoss(), # the loss is different from before - ClassTransformer( # we use the same transformer + ClassRepresentation( # we use the same representation CVClassifier(LogisticRegression(random_state=0), 5), is_probabilistic = True ), ) print(f"Evaluating {model}") -model.fit(training) +model.fit(Xtr, ytr) absolute_errors = qp.evaluation.evaluate( model = model, protocol = app, # use the same protocol for evaluation @@ -79,7 +80,7 @@ absolute_errors = qp.evaluation.evaluate( print(f"MAE = {np.mean(absolute_errors):.4f}+-{np.std(absolute_errors):.4f}") # In general, any composed method solves a linear system of equations by -# minimizing the loss after transforming the data. Methods of this kind include +# minimizing the loss after representing the data. Methods of this kind include # ACC, PACC, HDx, HDy, and many other well-known methods, as well as an # unlimited number of re-combinations of their building blocks. @@ -93,18 +94,18 @@ from quapy.method.composable import CombinedLoss model = ComposableQuantifier( CombinedLoss(HellingerSurrogateLoss(), LeastSquaresLoss()), - ClassTransformer( + ClassRepresentation( CVClassifier(LogisticRegression(random_state=0), 5), is_probabilistic = True ), ) -from qunfold.quapy import QuaPyWrapper -from qunfold import GenericMethod +from quapy.method.composable import QUnfoldWrapper +from qunfold import LinearMethod -model = QuaPyWrapper(GenericMethod( +model = QUnfoldWrapper(LinearMethod( CombinedLoss(HellingerSurrogateLoss(), LeastSquaresLoss()), - ClassTransformer( + ClassRepresentation( CVClassifier(LogisticRegression(random_state=0), 5), is_probabilistic = True ), @@ -115,7 +116,7 @@ model = QuaPyWrapper(GenericMethod( param_grid = { "loss__weights": [ (w, 1-w) for w in [.1, .5, .9] ], - "transformer__classifier__estimator__C": [1e-1, 1e1], + "representation__classifier__estimator__C": [1e-1, 1e1], } grid_search = qp.model_selection.GridSearchQ( @@ -125,7 +126,7 @@ grid_search = qp.model_selection.GridSearchQ( error = "mae", refit = False, verbose = True, -).fit(training) +).fit(Xtr, ytr) print( f"Best hyper-parameters = {grid_search.best_params_}", f"Best MAE = {grid_search.best_score_}", diff --git a/examples/16.KDEy_bandwidth.py b/examples/16.KDEy_bandwidth.py new file mode 100644 index 0000000..cc81ade --- /dev/null +++ b/examples/16.KDEy_bandwidth.py @@ -0,0 +1,83 @@ +import quapy as qp +import numpy as np +from quapy.protocol import UPP +from quapy.method.aggregative import KDEyML +import quapy.functional as F +from time import time + +""" +Let see one example: +""" + +# load some data +qp.environ['SAMPLE_SIZE'] = 100 +data = qp.datasets.fetch_UCIMulticlassDataset('molecular') +training, test = data.train_test +training, validation = training.split_stratified(train_prop=0.7, random_state=0) +protocol = UPP(validation) + +hyper_C = np.logspace(-3, 3, 7) + +model = KDEyML() + +with qp.util.temp_seed(0): + + param_grid = { + 'classifier__C': hyper_C, + 'bandwidth': np.linspace(0.01, 0.20, 20) # [0.01, 0.02, 0.03, ..., 0.20] + } + + model = qp.model_selection.GridSearchQ( + model=model, + param_grid=param_grid, + protocol=protocol, + error='mae', # the error to optimize is the MAE (a quantification-oriented loss) + 
refit=False, # retrain on the whole labelled set once done + n_jobs=-1, + verbose=True # show information as the process goes on + ).fit(training) + +best_params = model.best_params_ +took = model.fit_time_ +model = model.best_model_ +print(f'model selection ended: best hyper-parameters={best_params}') + +# evaluation in terms of MAE +# we use the same evaluation protocol (APP) on the test set +mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae') + +print(f'MAE={mae_score:.5f}') +print(f'model selection took {took:.1f}s') + + +model = KDEyML(bandwidth='auto') + +with qp.util.temp_seed(0): + + param_grid = { + 'classifier__C': hyper_C, + } + + model = qp.model_selection.GridSearchQ( + model=model, + param_grid=param_grid, + protocol=protocol, + error='mae', # the error to optimize is the MAE (a quantification-oriented loss) + refit=False, # retrain on the whole labelled set once done + n_jobs=-1, + verbose=True # show information as the process goes on + ).fit(training) + +best_params = model.best_params_ +took = model.fit_time_ +model = model.best_model_ +bandwidth = model.bandwidth_val +print(f'model selection ended: best hyper-parameters={best_params} ({bandwidth=})') + +# evaluation in terms of MAE +# we use the same evaluation protocol (APP) on the test set +mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae') + +print(f'MAE={mae_score:.5f}') +print(f'model selection took {took:.1f}s') + diff --git a/examples/16.confidence_regions.py b/examples/16.confidence_regions.py index f177e69..c8e95dd 100644 --- a/examples/16.confidence_regions.py +++ b/examples/16.confidence_regions.py @@ -1,4 +1,3 @@ -from quapy.method.confidence import BayesianCC from quapy.method.confidence import AggregativeBootstrap from quapy.method.aggregative import PACC import quapy.functional as F @@ -21,15 +20,15 @@ Let see one example: # load some data data = qp.datasets.fetch_UCIMulticlassDataset('molecular') train, test = data.train_test +Xtr, ytr = train.Xy # by simply wrapping an aggregative quantifier within the AggregativeBootstrap class, we can obtain confidence # intervals around the point estimate, in this case, at 95% of confidence pacc = AggregativeBootstrap(PACC(), n_test_samples=500, confidence_level=0.95) - with qp.util.temp_seed(0): # we train the quantifier the usual way - pacc.fit(train) + pacc.fit(Xtr, ytr) # let us simulate some shift in the test data random_prevalence = F.uniform_prevalence_sampling(n_classes=test.n_classes) @@ -37,7 +36,7 @@ with qp.util.temp_seed(0): true_prev = shifted_test.prevalence() # by calling "quantify_conf", we obtain the point estimate and the confidence intervals around it - pred_prev, conf_intervals = pacc.quantify_conf(shifted_test.X) + pred_prev, conf_intervals = pacc.predict_conf(shifted_test.X) # conf_intervals is an instance of ConfidenceRegionABC, which provides some useful utilities like: # - coverage: a function which computes the fraction of true values that belong to the confidence region @@ -53,7 +52,7 @@ with qp.util.temp_seed(0): print(f'point-estimate: {F.strprev(pred_prev)}') print(f'absolute error: {error:.3f}') print(f'Is the true value in the confidence region?: {conf_intervals.coverage(true_prev)==1}') - print(f'Proportion of simplex covered at {pacc.confidence_level*100:.1f}%: {conf_intervals.simplex_portion()*100:.2f}%') + print(f'Proportion of simplex covered at confidence level {pacc.confidence_level*100:.1f}%: {conf_intervals.simplex_portion()*100:.2f}%') """ Final remarks: diff --git 
a/examples/5.explicit_loss_minimization.py b/examples/17.explicit_loss_minimization.py similarity index 100% rename from examples/5.explicit_loss_minimization.py rename to examples/17.explicit_loss_minimization.py diff --git a/examples/18.ReadMe_for_text_analysis.py b/examples/18.ReadMe_for_text_analysis.py new file mode 100644 index 0000000..7a70022 --- /dev/null +++ b/examples/18.ReadMe_for_text_analysis.py @@ -0,0 +1,60 @@ +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_selection import SelectKBest, chi2 + +import quapy as qp +from quapy.method.non_aggregative import ReadMe +import quapy.functional as F +from sklearn.pipeline import Pipeline + +""" +This example showcases how to use the non-aggregative method ReadMe proposed by Hopkins and King. +This method is for text analysis, so let us first instantiate a dataset for sentiment quantification (we +use IMDb for this example). The method is quite computationally expensive, so we will restrict the training +set to 1000 documents only. +""" +reviews = qp.datasets.fetch_reviews('imdb').reduce(n_train=1000, random_state=0) + +""" +We need to convert text to bag-of-words representations. Actually, ReadMe requires the representations to be +binary (i.e., storing a 1 whenever a document contains certain word, or 0 otherwise), so we will not use +TFIDF weighting. We will also retain the top 1000 most important features according to chi2. +""" +encode_0_1 = Pipeline([ + ('0_1_terms', CountVectorizer(min_df=5, binary=True)), + ('feat_sel', SelectKBest(chi2, k=1000)) +]) +train, test = qp.data.preprocessing.instance_transformation(reviews, encode_0_1, inplace=True).train_test + +""" +We now instantiate ReadMe, with the prob_model='full' (default behaviour, implementing the Hopkins and King original +idea). This method consists of estimating Q(Y) by solving: + +Q(X) = \sum_i Q(X|Y=i) Q(Y=i) + +without resorting to estimating the posteriors Q(Y=i|X), by solving a linear least-squares problem. +However, since Q(X) and Q(X|Y=i) are matrices of shape (2^K, 1) and (2^K, n), with K the number of features +and n the number of classes, their calculation becomes intractable. ReadMe instead performs bagging (i.e., it +samples small sets of features and averages the results) thus reducing K to a few terms. In our example we +set K (bagging_range) to 20, and the number of bagging_trials to 100. + +ReadMe also computes confidence intervals via bootstrap. We set the number of bootstrap trials to 100. +""" +readme = ReadMe(prob_model='full', bootstrap_trials=100, bagging_trials=100, bagging_range=20, random_state=0, verbose=True) +readme.fit(*train.Xy) # <- there is actually nothing happening here (only bootstrap resampling); the method is "lazy" + # and postpones most of the calculations to the test phase. 
+ +# since the method is slow, we will only test 3 cases with different imbalances +few_negatives = [0.25, 0.75] +balanced = [0.5, 0.5] +few_positives = [0.75, 0.25] + +for test_prev in [few_negatives, balanced, few_positives]: + sample = reviews.test.sampling(500, *test_prev, random_state=0) # draw sets of 500 documents with desired prevs + prev_estim, conf = readme.predict_conf(sample.X) + err = qp.error.mae(sample.prevalence(), prev_estim) + print(f'true-prevalence={F.strprev(sample.prevalence())},\n' + f'predicted-prevalence={F.strprev(prev_estim)}, with confidence intervals {conf},\n' + f'MAE={err:.4f}') + + + diff --git a/examples/4.using_pretrained_classifier.py b/examples/4.using_pretrained_classifier.py new file mode 100644 index 0000000..5b5ead5 --- /dev/null +++ b/examples/4.using_pretrained_classifier.py @@ -0,0 +1,75 @@ +""" +Aggregative quantifiers use an underlying classifier. Often, one has one pre-trained classifier available, and +needs to use this classifier at the basis of a quantification system. In such cases, the classifier should not +be retrained, but only used to issue classifier predictions for the quantifier. +In this example, we show how to instantiate a quantifier with a pre-trained classifier. +""" +from typing import List, Dict + +import quapy as qp +from quapy.method.aggregative import PACC +from sklearn.base import BaseEstimator, ClassifierMixin +from transformers import pipeline +import numpy as np +import quapy.functional as F + + +# A scikit-learn's style wrapper for a huggingface-based pre-trained transformer for binary sentiment classification +class HFTextClassifier(BaseEstimator, ClassifierMixin): + def __init__(self, model_name='distilbert-base-uncased-finetuned-sst-2-english'): + self.pipe = pipeline("sentiment-analysis", model=model_name) + self.classes_ = np.asarray([0,1]) + + def fit(self, X, y=None): + return self + + def _binary_decisions(self, transformer_output: List[Dict]): + return np.array([(1 if p['label']=='POSITIVE' else 0) for p in transformer_output], dtype=int) + + def predict(self, X): + X = list(map(str, X)) + preds = self.pipe(X, truncation=True) + return self._binary_decisions(preds) + + def predict_proba(self, X): + X = list(map(str, X)) + n_examples = len(X) + preds = self.pipe(X, truncation=True) + decisions = self._binary_decisions(preds) + scores = np.array([p['score'] for p in preds], dtype=float) + probas = np.zeros(shape=(len(X), 2), dtype=float) + probas[np.arange(n_examples),decisions] = scores + probas[np.arange(n_examples),~decisions] = 1-scores + return probas + +# load a sentiment dataset +dataset = qp.datasets.fetch_reviews('imdb', tfidf=False) # raw text +train, test = dataset.training, dataset.test + +# instantiate a pre-trained classifier +clf = HFTextClassifier() + +# Let us fit a quantifier based on our pre-trained classifier. +# Note that, since the classifier is already fit, we will use the entire training set for +# learning the aggregation function of the quantifier. +# To do so, we only need to indicate "fit_classifier"=False, as follows: +quantifier = PACC(clf, fit_classifier=False) # Probabilistic Classify & Count using a pre-trained model + +print('training PACC...') +quantifier.fit(*train.Xy) + +# let us simulate some shifted test data... 
+new_prevalence = [0.75, 0.25] +shifted_test = test.sampling(500, *new_prevalence, random_state=0) + +# and do some evaluation +print('predicting with PACC...') +estim_prevalence = quantifier.predict(shifted_test.X) + +print('Result:\n'+('='*20)) +print(f'training prevalence: {F.strprev(train.prevalence())}') +print(f'(shifted) test prevalence: {F.strprev(shifted_test.prevalence())}') +print(f'estimated prevalence: {F.strprev(estim_prevalence)}') + +absolute_error = qp.error.ae(new_prevalence, estim_prevalence) +print(f'absolute error={absolute_error:.4f}') \ No newline at end of file diff --git a/examples/4.lequa2022_experiments.py b/examples/5a.lequa2022_experiments.py similarity index 92% rename from examples/4.lequa2022_experiments.py rename to examples/5a.lequa2022_experiments.py index 8bd9b09..40632d5 100644 --- a/examples/4.lequa2022_experiments.py +++ b/examples/5a.lequa2022_experiments.py @@ -31,13 +31,13 @@ training, val_generator, test_generator = fetch_lequa2022(task=task) Xtr, ytr = training.Xy # define the quantifier -quantifier = EMQ(classifier=LogisticRegression()) +quantifier = EMQ(classifier=LogisticRegression(), val_split=5) # model selection param_grid = { 'classifier__C': np.logspace(-3, 3, 7), # classifier-dependent: inverse of regularization strength 'classifier__class_weight': ['balanced', None], # classifier-dependent: weights of each class - # 'calib': ['bcts', None] # quantifier-dependent: recalibration method (new in v0.1.7) + 'calib': ['bcts', None] # quantifier-dependent: recalibration method (new in v0.1.7) } model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True) quantifier = model_selection.fit(Xtr, ytr) diff --git a/examples/4b.lequa2024_experiments.py b/examples/5b.lequa2024_experiments.py similarity index 99% rename from examples/4b.lequa2024_experiments.py rename to examples/5b.lequa2024_experiments.py index c5b6f92..351fed1 100644 --- a/examples/4b.lequa2024_experiments.py +++ b/examples/5b.lequa2024_experiments.py @@ -1,6 +1,6 @@ +import quapy as qp import numpy as np from sklearn.linear_model import LogisticRegression -import quapy as qp import quapy.functional as F from quapy.data.datasets import LEQUA2024_SAMPLE_SIZE, fetch_lequa2024 from quapy.evaluation import evaluation_report @@ -14,6 +14,7 @@ LeQua competition itself, check: https://lequa2024.github.io/index (the site of the competition) """ + # there are 4 tasks: T1 (binary), T2 (multiclass), T3 (ordinal), T4 (binary - covariate & prior shift) task = 'T2' @@ -38,6 +39,7 @@ param_grid = { 'classifier__class_weight': ['balanced', None], # classifier-dependent: weights of each class 'bandwidth': np.linspace(0.01, 0.2, 20) # quantifier-dependent: bandwidth of the kernel } + model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True) quantifier = model_selection.fit(Xtr, ytr) diff --git a/examples/7.uci_experiments.py b/examples/7.uci_binary_experiments.py similarity index 76% rename from examples/7.uci_experiments.py rename to examples/7.uci_binary_experiments.py index 0c328e9..04e07ee 100644 --- a/examples/7.uci_experiments.py +++ b/examples/7.uci_binary_experiments.py @@ -1,4 +1,7 @@ from copy import deepcopy +from pathlib import Path + +import pandas as pd import quapy as qp from sklearn.calibration import CalibratedClassifierCV @@ -15,6 +18,18 @@ import itertools import argparse import torch import shutil +from glob import glob + + +""" +This example shows how to generate 
experiments for the UCI ML repository binary datasets following the protocol +proposed in "Pérez-Gállego , P., Quevedo , J. R., and del Coz, J. J. Using ensembles for problems with characteriz- +able changes in data distribution: A case study on quantification. Information Fusion 34 (2017), 87–100." + +This example covers most important steps in the experimentation pipeline, namely, the training and optimization +of the hyperparameters of different quantifiers, and the evaluation of these quantifiers based on standard +prevalence sampling protocols aimed at simulating different levels of prior probability shift. +""" N_JOBS = -1 @@ -28,10 +43,6 @@ def newLR(): return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1) -def calibratedLR(): - return CalibratedClassifierCV(newLR()) - - __C_range = np.logspace(-3, 3, 7) lr_params = { 'classifier__C': __C_range, @@ -74,6 +85,13 @@ def result_path(path, dataset_name, model_name, run, optim_loss): return os.path.join(path, f'{dataset_name}-{model_name}-run{run}-{optim_loss}.pkl') +def parse_result_path(path): + *dataset, method, run, metric = Path(path).name.split('-') + dataset = '-'.join(dataset) + run = int(run.replace('run','')) + return dataset, method, run, metric + + def is_already_computed(dataset_name, model_name, run, optim_loss): return os.path.exists(result_path(args.results, dataset_name, model_name, run, optim_loss)) @@ -130,10 +148,28 @@ def run(experiment): best_params) +def show_results(result_folder): + result_data = [] + for file in glob(os.path.join(result_folder,'*.pkl')): + true_prevalences, estim_prevalences, *_ = pickle.load(open(file, 'rb')) + dataset, method, run, metric = parse_result_path(file) + mae = qp.error.mae(true_prevalences, estim_prevalences) + result_data.append({ + 'dataset': dataset, + 'method': method, + 'run': run, + metric: mae + }) + df = pd.DataFrame(result_data) + pd.set_option("display.max_columns", None) + pd.set_option("display.expand_frame_repr", False) + print(df.pivot_table(index='dataset', columns='method', values=metric)) + + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification') parser.add_argument('--results', metavar='RESULT_PATH', type=str, - help='path to the directory where to store the results', default='./uci_results') + help='path to the directory where to store the results', default='./results/uci_binary') parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification', help='path to the directory with svmperf') parser.add_argument('--checkpointdir', metavar='PATH', type=str, default='./checkpoint', @@ -155,3 +191,5 @@ if __name__ == '__main__': qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=CUDA_N_JOBS) shutil.rmtree(args.checkpointdir, ignore_errors=True) + + show_results(args.results) diff --git a/examples/8.ucimulti_experiments.py b/examples/8.uci_multiclass_experiments.py similarity index 88% rename from examples/8.ucimulti_experiments.py rename to examples/8.uci_multiclass_experiments.py index e2a8d97..06f7ea7 100644 --- a/examples/8.ucimulti_experiments.py +++ b/examples/8.uci_multiclass_experiments.py @@ -1,4 +1,3 @@ -import pickle import os from time import time from collections import defaultdict @@ -7,11 +6,16 @@ import numpy as np from sklearn.linear_model import LogisticRegression import quapy as qp -from quapy.method.aggregative import PACC, EMQ +from quapy.method.aggregative import PACC, EMQ, KDEyML from 
quapy.model_selection import GridSearchQ from quapy.protocol import UPP from pathlib import Path +""" +This example is the analogous counterpart of example 7 but involving multiclass quantification problems +using datasets from the UCI ML repository. +""" + SEED = 1 @@ -31,7 +35,7 @@ def wrap_hyper(classifier_hyper_grid:dict): METHODS = [ ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)), ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)), - # ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}), + ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}), ] @@ -43,6 +47,7 @@ def show_results(result_path): pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE", "t_train"], margins=True) print(pv) + def load_timings(result_path): import pandas as pd timings = defaultdict(lambda: {}) @@ -59,7 +64,7 @@ if __name__ == '__main__': qp.environ['N_JOBS'] = -1 n_bags_val = 250 n_bags_test = 1000 - result_dir = f'results/ucimulti' + result_dir = f'results/uci_multiclass' os.makedirs(result_dir, exist_ok=True) @@ -100,7 +105,7 @@ if __name__ == '__main__': t_init = time() try: - modsel.fit(train) + modsel.fit(*train.Xy) print(f'best params {modsel.best_params_}') print(f'best score {modsel.best_score_}') @@ -108,7 +113,8 @@ if __name__ == '__main__': quantifier = modsel.best_model() except: print('something went wrong... trying to fit the default model') - quantifier.fit(train) + quantifier.fit(*train.Xy) + timings[method_name][dataset] = time() - t_init diff --git a/examples/9.ifcb_experiments.py b/examples/9.ifcb_experiments.py index 8fb39d1..580be6b 100644 --- a/examples/9.ifcb_experiments.py +++ b/examples/9.ifcb_experiments.py @@ -6,6 +6,18 @@ from sklearn.linear_model import LogisticRegression from quapy.model_selection import GridSearchQ from quapy.evaluation import evaluation_report +""" +This example shows a complete experiment using the IFCB Plankton dataset; +see https://hlt-isti.github.io/QuaPy/manuals/datasets.html#ifcb-plankton-dataset + +Note that this dataset can be downloaded in two modes: for model selection or for evaluation. 
+ +See also: +Automatic plankton quantification using deep features +P González, A Castaño, EE Peacock, J Díez, JJ Del Coz, HM Sosik +Journal of Plankton Research 41 (4), 449-463 +""" + print('Quantifying the IFCB dataset with PACC\n') @@ -30,7 +42,7 @@ mod_sel = GridSearchQ( n_jobs=-1, verbose=True, raise_errors=True -).fit(train) +).fit(*train.Xy) print(f'model selection chose hyperparameters: {mod_sel.best_params_}') quantifier = mod_sel.best_model_ @@ -42,7 +54,7 @@ print(f'\ttraining size={len(train)}, features={train.X.shape[1]}, classes={trai print(f'\ttest samples={test_gen.total()}') print('training on the whole dataset before test') -quantifier.fit(train) +quantifier.fit(*train.Xy) print('testing...') report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True) diff --git a/examples/ensembles.py b/examples/ensembles.py index 8c0d07e..84aeb2c 100644 --- a/examples/ensembles.py +++ b/examples/ensembles.py @@ -1,10 +1,16 @@ +from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model import LogisticRegression +from sklearn.naive_bayes import MultinomialNB +from sklearn.neighbors import KNeighborsClassifier from statsmodels.sandbox.distributions.genpareto import quant import quapy as qp from quapy.protocol import UPP from quapy.method.aggregative import PACC, DMy, EMQ, KDEyML -from quapy.method.meta import SCMQ +from quapy.method.meta import SCMQ, MCMQ, MCSQ +import warnings +warnings.filterwarnings("ignore", category=DeprecationWarning) +warnings.filterwarnings("ignore", category=ConvergenceWarning) qp.environ["SAMPLE_SIZE"]=100 @@ -32,5 +38,19 @@ scmq = SCMQ(classifier, quantifiers) train_and_test_model(scmq, train, test) -for quantifier in quantifiers: - train_and_test_model(quantifier, train, test) \ No newline at end of file +# for quantifier in quantifiers: +# train_and_test_model(quantifier, train, test) + +classifiers = [ + LogisticRegression(), + KNeighborsClassifier(), + # MultinomialNB() +] + +mcmq = MCMQ(classifiers, quantifiers) + +train_and_test_model(mcmq, train, test) + +mcsq = MCSQ(classifiers, PACC()) + +train_and_test_model(mcsq, train, test) \ No newline at end of file diff --git a/experimental_non_aggregative/custom_vectorizers.py b/experimental_non_aggregative/custom_vectorizers.py new file mode 100644 index 0000000..13337b9 --- /dev/null +++ b/experimental_non_aggregative/custom_vectorizers.py @@ -0,0 +1,254 @@ +from scipy.sparse import csc_matrix, csr_matrix +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer +import numpy as np +from joblib import Parallel, delayed +import sklearn +import math +from scipy.stats import t + + +class ContTable: + def __init__(self, tp=0, tn=0, fp=0, fn=0): + self.tp=tp + self.tn=tn + self.fp=fp + self.fn=fn + + def get_d(self): return self.tp + self.tn + self.fp + self.fn + + def get_c(self): return self.tp + self.fn + + def get_not_c(self): return self.tn + self.fp + + def get_f(self): return self.tp + self.fp + + def get_not_f(self): return self.tn + self.fn + + def p_c(self): return (1.0*self.get_c())/self.get_d() + + def p_not_c(self): return 1.0-self.p_c() + + def p_f(self): return (1.0*self.get_f())/self.get_d() + + def p_not_f(self): return 1.0-self.p_f() + + def p_tp(self): return (1.0*self.tp) / self.get_d() + + def p_tn(self): return (1.0*self.tn) / self.get_d() + + def p_fp(self): return (1.0*self.fp) / self.get_d() + + def p_fn(self): return (1.0*self.fn) / 
self.get_d() + + def tpr(self): + c = 1.0*self.get_c() + return self.tp / c if c > 0.0 else 0.0 + + def fpr(self): + _c = 1.0*self.get_not_c() + return self.fp / _c if _c > 0.0 else 0.0 + + +def __ig_factor(p_tc, p_t, p_c): + den = p_t * p_c + if den != 0.0 and p_tc != 0: + return p_tc * math.log(p_tc / den, 2) + else: + return 0.0 + + +def information_gain(cell): + return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \ + __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\ + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \ + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c()) + + +def squared_information_gain(cell): + return information_gain(cell)**2 + + +def posneg_information_gain(cell): + ig = information_gain(cell) + if cell.tpr() < cell.fpr(): + return -ig + else: + return ig + + +def pos_information_gain(cell): + if cell.tpr() < cell.fpr(): + return 0 + else: + return information_gain(cell) + +def pointwise_mutual_information(cell): + return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + + +def gss(cell): + return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn() + + +def chi_square(cell): + den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c() + if den==0.0: return 0.0 + num = gss(cell)**2 + return num / den + + +def conf_interval(xt, n): + if n>30: + z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2 + else: + z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2 + p = (xt + 0.5 * z2) / (n + z2) + amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2)) + return p, amplitude + + +def strength(minPosRelFreq, minPos, maxNeg): + if minPos > maxNeg: + return math.log(2.0 * minPosRelFreq, 2.0) + else: + return 0.0 + + +#set cancel_features=True to allow some features to be weighted as 0 (as in the original article) +#however, for some extremely imbalanced dataset caused all documents to be 0 +def conf_weight(cell, cancel_features=False): + c = cell.get_c() + not_c = cell.get_not_c() + tp = cell.tp + fp = cell.fp + + pos_p, pos_amp = conf_interval(tp, c) + neg_p, neg_amp = conf_interval(fp, not_c) + + min_pos = pos_p-pos_amp + max_neg = neg_p+neg_amp + den = (min_pos + max_neg) + minpos_relfreq = min_pos / (den if den != 0 else 1) + + str_tplus = strength(minpos_relfreq, min_pos, max_neg); + + if str_tplus == 0 and not cancel_features: + return 1e-20 + + return str_tplus + + +def get_tsr_matrix(cell_matrix, tsr_score_funtion): + nC = len(cell_matrix) + nF = len(cell_matrix[0]) + tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)] + return np.array(tsr_matrix) + + +def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD): + tp_ = len(positive_document_indexes & feature_document_indexes) + fp_ = len(feature_document_indexes - positive_document_indexes) + fn_ = len(positive_document_indexes - feature_document_indexes) + tn_ = nD - (tp_ + fp_ + fn_) + return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_) + + +def category_tables(feature_sets, category_sets, c, nD, nF): + return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)] + + +def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1): + """ + Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c. 
+ Efficiency O(nF x nC x log(S)) where S is the sparse factor + """ + + nD, nF = coocurrence_matrix.shape + nD2, nC = label_matrix.shape + + if nD != nD2: + raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' % + (coocurrence_matrix.shape,label_matrix.shape)) + + def nonzero_set(matrix, col): + return set(matrix[:, col].nonzero()[0]) + + if isinstance(coocurrence_matrix, csr_matrix): + coocurrence_matrix = csc_matrix(coocurrence_matrix) + feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)] + category_sets = [nonzero_set(label_matrix, c) for c in range(nC)] + cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")( + delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC) + ) + return np.array(cell_matrix) + + +class TSRweighting(BaseEstimator,TransformerMixin): + """ + Supervised Term Weighting function based on any Term Selection Reduction (TSR) function (e.g., information gain, + chi-square, etc.) or, more generally, on any function that could be computed on the 4-cell contingency table for + each category-feature pair. + The supervised_4cell_matrix is a `(n_classes, n_words)` matrix containing the 4-cell contingency tables + for each class-word pair, and can be pre-computed (e.g., during the feature selection phase) and passed as an + argument. + When `n_classes>1`, i.e., in multiclass scenarios, a global_policy is used in order to determine a + single feature-score which informs about its relevance. Accepted policies include "max" (takes the max score + across categories), "ave" and "wave" (take the average, or weighted average, across all categories -- weights + correspond to the class prevalence), and "sum" (which sums all category scores). 
+ """ + + def __init__(self, tsr_function, global_policy='max', supervised_4cell_matrix=None, sublinear_tf=True, norm='l2', min_df=3, n_jobs=-1): + if global_policy not in ['max', 'ave', 'wave', 'sum']: raise ValueError('Global policy should be in {"max", "ave", "wave", "sum"}') + self.tsr_function = tsr_function + self.global_policy = global_policy + self.supervised_4cell_matrix = supervised_4cell_matrix + self.sublinear_tf = sublinear_tf + self.norm = norm + self.min_df = min_df + self.n_jobs = n_jobs + + def fit(self, X, y): + self.count_vectorizer = CountVectorizer(min_df=self.min_df) + X = self.count_vectorizer.fit_transform(X) + + self.tf_vectorizer = TfidfTransformer( + norm=None, use_idf=False, smooth_idf=False, sublinear_tf=self.sublinear_tf + ).fit(X) + + if len(y.shape) == 1: + y = np.expand_dims(y, axis=1) + + nD, nC = y.shape + nF = len(self.tf_vectorizer.get_feature_names_out()) + + if self.supervised_4cell_matrix is None: + self.supervised_4cell_matrix = get_supervised_matrix(X, y, n_jobs=self.n_jobs) + else: + if self.supervised_4cell_matrix.shape != (nC, nF): + raise ValueError("Shape of supervised information matrix is inconsistent with X and y") + + tsr_matrix = get_tsr_matrix(self.supervised_4cell_matrix, self.tsr_function) + + if self.global_policy == 'ave': + self.global_tsr_vector = np.average(tsr_matrix, axis=0) + elif self.global_policy == 'wave': + category_prevalences = [sum(y[:,c])*1.0/nD for c in range(nC)] + self.global_tsr_vector = np.average(tsr_matrix, axis=0, weights=category_prevalences) + elif self.global_policy == 'sum': + self.global_tsr_vector = np.sum(tsr_matrix, axis=0) + elif self.global_policy == 'max': + self.global_tsr_vector = np.amax(tsr_matrix, axis=0) + return self + + def fit_transform(self, X, y): + return self.fit(X,y).transform(X) + + def transform(self, X): + if not hasattr(self, 'global_tsr_vector'): raise NameError('TSRweighting: transform method called before fit.') + X = self.count_vectorizer.transform(X) + tf_X = self.tf_vectorizer.transform(X).toarray() + weighted_X = np.multiply(tf_X, self.global_tsr_vector) + if self.norm is not None and self.norm!='none': + weighted_X = sklearn.preprocessing.normalize(weighted_X, norm=self.norm, axis=1, copy=False) + return csr_matrix(weighted_X) diff --git a/experimental_non_aggregative/method_dxs.py b/experimental_non_aggregative/method_dxs.py new file mode 100644 index 0000000..93fb67e --- /dev/null +++ b/experimental_non_aggregative/method_dxs.py @@ -0,0 +1,208 @@ +from scipy.sparse import issparse +from sklearn.decomposition import TruncatedSVD +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import StandardScaler + +import quapy as qp +from data import LabelledCollection +import numpy as np + +from experimental_non_aggregative.custom_vectorizers import * +from method._kdey import KDEBase +from protocol import APP +from quapy.method.aggregative import HDy, DistributionMatchingY +from quapy.method.base import BaseQuantifier +from scipy import optimize +import pandas as pd +import quapy.functional as F + + +# TODO: explore the bernoulli (term presence/absence) variant +# TODO: explore the multinomial (term frequency) variant +# TODO: explore the multinomial + length normalization variant +# TODO: consolidate the TSR-variant (e.g., using information gain) variant; +# - works better with the idf? +# - works better with length normalization? 
+# - etc + +class DxS(BaseQuantifier): + def __init__(self, vectorizer=None, divergence='topsoe'): + self.vectorizer = vectorizer + self.divergence = divergence + + # def __as_distribution(self, instances): + # return np.asarray(instances.sum(axis=0) / instances.sum()).flatten() + + def __as_distribution(self, instances): + dist = instances.mean(axis=0) + return np.asarray(dist).flatten() + + def fit(self, text_instances, labels): + + classes = np.unique(labels) + + if self.vectorizer is not None: + text_instances = self.vectorizer.fit_transform(text_instances, y=labels) + + distributions = [] + for class_i in classes: + distributions.append(self.__as_distribution(text_instances[labels == class_i])) + + self.validation_distribution = np.asarray(distributions) + + return self + + def predict(self, text_instances): + if self.vectorizer is not None: + text_instances = self.vectorizer.transform(text_instances) + + test_distribution = self.__as_distribution(text_instances) + divergence = qp.functional.get_divergence(self.divergence) + n_classes, n_feats = self.validation_distribution.shape + + def match(prev): + prev = np.expand_dims(prev, axis=0) + mixture_distribution = (prev @ self.validation_distribution).flatten() + return divergence(test_distribution, mixture_distribution) + + # the initial point is set as the uniform distribution + uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + + # solutions are bounded to those contained in the unit-simplex + bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1] + constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 + r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) + return r.x + + + +class KDExML(BaseQuantifier, KDEBase): + + def __init__(self, bandwidth=0.1, standardize=False): + self._check_bandwidth(bandwidth) + self.bandwidth = bandwidth + self.standardize = standardize + + def fit(self, X, y): + classes = sorted(np.unique(y)) + + if self.standardize: + self.scaler = StandardScaler() + X = self.scaler.fit_transform(X) + + if issparse(X): + X = X.toarray() + + self.mix_densities = self.get_mixture_components(X, y, classes, self.bandwidth) + return self + + def predict(self, X): + """ + Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood + of the data (i.e., that minimizes the negative log-likelihood) + + :param X: instances in the sample + :return: a vector of class prevalence estimates + """ + epsilon = 1e-10 + if issparse(X): + X = X.toarray() + n_classes = len(self.mix_densities) + if self.standardize: + X = self.scaler.transform(X) + test_densities = [self.pdf(kde_i, X) for kde_i in self.mix_densities] + + def neg_loglikelihood(prev): + test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities)) + test_loglikelihood = np.log(test_mixture_likelihood + epsilon) + return -np.sum(test_loglikelihood) + + return F.optim_minimize(neg_loglikelihood, n_classes) + + + +if __name__ == '__main__': + + qp.environ['SAMPLE_SIZE'] = 250 + qp.environ['N_JOBS'] = -1 + min_df = 10 + # dataset = 'imdb' + repeats = 10 + error = 'mae' + + div = 'topsoe' + + # generates tuples (dataset, method, method_name) + # (the dataset is needed for methods that process the dataset differently) + def gen_methods(): + + for dataset in qp.datasets.REVIEWS_SENTIMENT_DATASETS: + + data = qp.datasets.fetch_reviews(dataset, tfidf=False) + + # bernoulli_vectorizer = 
CountVectorizer(min_df=min_df, binary=True) + # dxs = DxS(divergence=div, vectorizer=bernoulli_vectorizer) + # yield data, dxs, 'DxS-Bernoulli' + # + # multinomial_vectorizer = CountVectorizer(min_df=min_df, binary=False) + # dxs = DxS(divergence=div, vectorizer=multinomial_vectorizer) + # yield data, dxs, 'DxS-multinomial' + # + # tf_vectorizer = TfidfVectorizer(sublinear_tf=False, use_idf=False, min_df=min_df, norm=None) + # dxs = DxS(divergence=div, vectorizer=tf_vectorizer) + # yield data, dxs, 'DxS-TF' + # + # logtf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False, min_df=min_df, norm=None) + # dxs = DxS(divergence=div, vectorizer=logtf_vectorizer) + # yield data, dxs, 'DxS-logTF' + # + # tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm=None) + # dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer) + # yield data, dxs, 'DxS-TFIDF' + # + # tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm='l2') + # dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer) + # yield data, dxs, 'DxS-TFIDF-l2' + + tsr_vectorizer = TSRweighting(tsr_function=information_gain, min_df=min_df, norm='l2') + dxs = DxS(divergence=div, vectorizer=tsr_vectorizer) + yield data, dxs, 'DxS-TFTSR-l2' + + data = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=min_df) + + kdex = KDExML() + reduction = TruncatedSVD(n_components=100, random_state=0) + red_data = qp.data.preprocessing.instance_transformation(data, transformer=reduction, inplace=False) + yield red_data, kdex, 'KDEx' + + hdy = HDy(LogisticRegression()) + yield data, hdy, 'HDy' + + # dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=5) + # yield data, dm, 'DM-5b' + # + # dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=10) + # yield data, dm, 'DM-10b' + + + + + result_path = 'results.csv' + with open(result_path, 'wt') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\n') + for data, quantifier, quant_name in gen_methods(): + quantifier.fit(*data.training.Xy) + report = qp.evaluation.evaluation_report(quantifier, APP(data.test, repeats=repeats), error_metrics=['mae','mrae'], verbose=True) + means = report.mean(numeric_only=True) + csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n') + + df = pd.read_csv(result_path, sep='\t') + # print(df) + + pv = df.pivot_table(index='Method', columns="Dataset", values=["MAE", "MRAE"]) + print(pv) + + + + diff --git a/logo/LogoQuaDaSh.png b/logo/LogoQuaDaSh.png new file mode 100644 index 0000000..5daff40 Binary files /dev/null and b/logo/LogoQuaDaSh.png differ diff --git a/logo/NextGenerationEU.jpg b/logo/NextGenerationEU.jpg new file mode 100644 index 0000000..c377101 Binary files /dev/null and b/logo/NextGenerationEU.jpg differ diff --git a/prepare_svmperf.sh b/prepare_svmperf.sh index b609f6c..3da8bfe 100755 --- a/prepare_svmperf.sh +++ b/prepare_svmperf.sh @@ -11,13 +11,5 @@ rm $FILE patch -s -p0 < svm-perf-quantification-ext.patch mv svm_perf svm_perf_quantification cd svm_perf_quantification -make - - - - - - - - +make CFLAGS="-O3 -Wall -Wno-unused-result -fcommon" diff --git a/quapy/__init__.py b/quapy/__init__.py index 90f7a70..a952fbc 100644 --- a/quapy/__init__.py +++ b/quapy/__init__.py @@ -1,5 +1,4 @@ """QuaPy module for quantification""" -from sklearn.linear_model import LogisticRegression from quapy.data import datasets from . import error @@ -14,7 +13,13 @@ from . import model_selection from . 
import classification import os -__version__ = '0.2.0' +__version__ = '0.2.1' + + +def _default_cls(): + from sklearn.linear_model import LogisticRegression + return LogisticRegression() + environ = { 'SAMPLE_SIZE': None, @@ -24,7 +29,7 @@ environ = { 'PAD_INDEX': 1, 'SVMPERF_HOME': './svm_perf_quantification', 'N_JOBS': int(os.getenv('N_JOBS', 1)), - 'DEFAULT_CLS': LogisticRegression() + 'DEFAULT_CLS': _default_cls() } @@ -68,3 +73,5 @@ def _get_classifier(classifier): if classifier is None: raise ValueError('neither classifier nor qp.environ["DEFAULT_CLS"] have been specified') return classifier + + diff --git a/quapy/classification/svmperf.py b/quapy/classification/svmperf.py index 6c85084..71f2ac3 100644 --- a/quapy/classification/svmperf.py +++ b/quapy/classification/svmperf.py @@ -33,27 +33,16 @@ class SVMperf(BaseEstimator, ClassifierMixin): valid_losses = {'01':0, 'f1':1, 'kld':12, 'nkld':13, 'q':22, 'qacc':23, 'qf1':24, 'qgm':25, 'mae':26, 'mrae':27} def __init__(self, svmperf_base, C=0.01, verbose=False, loss='01', host_folder=None): - assert exists(svmperf_base), f'path {svmperf_base} does not seem to point to a valid path' + assert exists(svmperf_base), \ + (f'path {svmperf_base} does not seem to point to a valid path;' + f'did you install svm-perf? ' + f'see instructions in https://hlt-isti.github.io/QuaPy/manuals/explicit-loss-minimization.html') self.svmperf_base = svmperf_base self.C = C self.verbose = verbose self.loss = loss self.host_folder = host_folder - # def set_params(self, **parameters): - # """ - # Set the hyper-parameters for svm-perf. Currently, only the `C` and `loss` parameters are supported - # - # :param parameters: a `**kwargs` dictionary `{'C': }` - # """ - # assert sorted(list(parameters.keys())) == ['C', 'loss'], \ - # 'currently, only the C and loss parameters are supported' - # self.C = parameters.get('C', self.C) - # self.loss = parameters.get('loss', self.loss) - # - # def get_params(self, deep=True): - # return {'C': self.C, 'loss': self.loss} - def fit(self, X, y): """ Trains the SVM for the multivariate performance loss diff --git a/quapy/data/base.py b/quapy/data/base.py index b0fa779..6db182a 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -9,7 +9,7 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold from numpy.random import RandomState from quapy.functional import strprev from quapy.util import temp_seed -import functional as F +import quapy.functional as F class LabelledCollection: @@ -33,7 +33,6 @@ class LabelledCollection: else: self.instances = np.asarray(instances) self.labels = np.asarray(labels) - n_docs = len(self) if classes is None: self.classes_ = F.classes_from_labels(self.labels) else: @@ -41,7 +40,13 @@ class LabelledCollection: self.classes_.sort() if len(set(self.labels).difference(set(classes))) > 0: raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes)})') - self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_} + self._index = None + + @property + def index(self): + if self._index is None: + self._index = {class_: np.arange(len(self))[self.labels == class_] for class_ in self.classes_} + return self._index @classmethod def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs): diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index a6760a3..c08748f 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -114,7 +114,8 @@ def fetch_reviews(dataset_name, 
tfidf=False, min_df=None, data_home=None, pickle """ Loads a Reviews dataset as a Dataset instance, as used in `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification." - Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. `_. + Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. + `_. The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS` :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb' @@ -499,7 +500,7 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, standardize= y = df["NSP"].astype(int).values elif group == "semeion": with download_tmp_file("semeion", "semeion.data") as tmp: - df = pd.read_csv(tmp, header=None, delim_whitespace=True) + df = pd.read_csv(tmp, header=None, sep='\s+') X = df.iloc[:, 0:256].astype(float).values y = df[263].values # 263 stands for digit 8 (labels are one-hot vectors from col 256-266) else: diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py index a4be4fd..5f7e0a9 100644 --- a/quapy/data/preprocessing.py +++ b/quapy/data/preprocessing.py @@ -10,6 +10,37 @@ from quapy.util import map_parallel from .base import LabelledCollection +def instance_transformation(dataset:Dataset, transformer, inplace=False): + """ + Transforms a :class:`quapy.data.base.Dataset` applying the `fit_transform` and `transform` functions + of a (sklearn's) transformer. + + :param dataset: a :class:`quapy.data.base.Dataset` where the instances of training and test collections are + lists of str + :param transformer: TransformerMixin implementing `fit_transform` and `transform` functions + :param inplace: whether or not to apply the transformation inplace (True), or to a new copy (False, default) + :return: a new :class:`quapy.data.base.Dataset` with transformed instances (if inplace=False) or a reference to the + current Dataset (if inplace=True) where the instances have been transformed + """ + training_transformed = transformer.fit_transform(*dataset.training.Xy) + test_transformed = transformer.transform(dataset.test.X) + orig_name = dataset.name + + if inplace: + dataset.training = LabelledCollection(training_transformed, dataset.training.labels, dataset.classes_) + dataset.test = LabelledCollection(test_transformed, dataset.test.labels, dataset.classes_) + if hasattr(transformer, 'vocabulary_'): + dataset.vocabulary = transformer.vocabulary_ + return dataset + else: + training = LabelledCollection(training_transformed, dataset.training.labels.copy(), dataset.classes_) + test = LabelledCollection(test_transformed, dataset.test.labels.copy(), dataset.classes_) + vocab = None + if hasattr(transformer, 'vocabulary_'): + vocab = transformer.vocabulary_ + return Dataset(training, test, vocabulary=vocab, name=orig_name) + + def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs): """ Transforms a :class:`quapy.data.base.Dataset` of textual instances into a :class:`quapy.data.base.Dataset` of @@ -29,18 +60,7 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw __check_type(dataset.test.instances, np.ndarray, str) vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs) - training_documents = vectorizer.fit_transform(dataset.training.instances) - test_documents = vectorizer.transform(dataset.test.instances) - - if inplace: - dataset.training = 
LabelledCollection(training_documents, dataset.training.labels, dataset.classes_) - dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_) - dataset.vocabulary = vectorizer.vocabulary_ - return dataset - else: - training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_) - test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_) - return Dataset(training, test, vectorizer.vocabulary_) + return instance_transformation(dataset, vectorizer, inplace) def reduce_columns(dataset: Dataset, min_df=5, inplace=False): diff --git a/quapy/error.py b/quapy/error.py index 201ab8f..eb42cd6 100644 --- a/quapy/error.py +++ b/quapy/error.py @@ -45,89 +45,95 @@ def acce(y_true, y_pred): return 1. - (y_true == y_pred).mean() -def mae(prevs, prevs_hat): +def mae(prevs_true, prevs_hat): """Computes the mean absolute error (see :meth:`quapy.error.ae`) across the sample pairs. - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: mean absolute error """ - return ae(prevs, prevs_hat).mean() + return ae(prevs_true, prevs_hat).mean() -def ae(prevs, prevs_hat): +def ae(prevs_true, prevs_hat): """Computes the absolute error between the two prevalence vectors. Absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as :math:`AE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}|\\hat{p}(y)-p(y)|`, where :math:`\\mathcal{Y}` are the classes of interest. - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: absolute error """ - assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}' - return abs(prevs_hat - prevs).mean(axis=-1) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape} vs. {prevs_hat.shape}' + return abs(prevs_hat - prevs_true).mean(axis=-1) -def nae(prevs, prevs_hat): +def nae(prevs_true, prevs_hat): """Computes the normalized absolute error between the two prevalence vectors. Normalized absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as :math:`NAE(p,\\hat{p})=\\frac{AE(p,\\hat{p})}{z_{AE}}`, where :math:`z_{AE}=\\frac{2(1-\\min_{y\\in \\mathcal{Y}} p(y))}{|\\mathcal{Y}|}`, and :math:`\\mathcal{Y}` are the classes of interest. - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: normalized absolute error """ - assert prevs.shape == prevs_hat.shape, f'wrong shape {prevs.shape} vs. {prevs_hat.shape}' - return abs(prevs_hat - prevs).sum(axis=-1)/(2*(1-prevs.min(axis=-1))) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + assert prevs_true.shape == prevs_hat.shape, f'wrong shape {prevs_true.shape} vs. 
{prevs_hat.shape}' + return abs(prevs_hat - prevs_true).sum(axis=-1)/(2 * (1 - prevs_true.min(axis=-1))) -def mnae(prevs, prevs_hat): +def mnae(prevs_true, prevs_hat): """Computes the mean normalized absolute error (see :meth:`quapy.error.nae`) across the sample pairs. - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: mean normalized absolute error """ - return nae(prevs, prevs_hat).mean() + return nae(prevs_true, prevs_hat).mean() -def mse(prevs, prevs_hat): +def mse(prevs_true, prevs_hat): """Computes the mean squared error (see :meth:`quapy.error.se`) across the sample pairs. - :param prevs: array-like of shape `(n_samples, n_classes,)` with the + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: mean squared error """ - return se(prevs, prevs_hat).mean() + return se(prevs_true, prevs_hat).mean() -def se(prevs, prevs_hat): +def se(prevs_true, prevs_hat): """Computes the squared error between the two prevalence vectors. Squared error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as :math:`SE(p,\\hat{p})=\\frac{1}{|\\mathcal{Y}|}\\sum_{y\\in \\mathcal{Y}}(\\hat{p}(y)-p(y))^2`, where :math:`\\mathcal{Y}` are the classes of interest. - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: absolute error """ - return ((prevs_hat - prevs) ** 2).mean(axis=-1) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + return ((prevs_hat - prevs_true) ** 2).mean(axis=-1) -def mkld(prevs, prevs_hat, eps=None): +def mkld(prevs_true, prevs_hat, eps=None): """Computes the mean Kullback-Leibler divergence (see :meth:`quapy.error.kld`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values @@ -137,10 +143,10 @@ def mkld(prevs, prevs_hat, eps=None): (which has thus to be set beforehand). :return: mean Kullback-Leibler distribution """ - return kld(prevs, prevs_hat, eps).mean() + return kld(prevs_true, prevs_hat, eps).mean() -def kld(prevs, prevs_hat, eps=None): +def kld(prevs_true, prevs_hat, eps=None): """Computes the Kullback-Leibler divergence between the two prevalence distributions. Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}` is computed as @@ -149,7 +155,7 @@ def kld(prevs, prevs_hat, eps=None): where :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). 
- :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. KLD is not defined in cases in which the distributions contain zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample size. @@ -158,17 +164,17 @@ def kld(prevs, prevs_hat, eps=None): :return: Kullback-Leibler divergence between the two distributions """ eps = __check_eps(eps) - smooth_prevs = smooth(prevs, eps) + smooth_prevs = smooth(prevs_true, eps) smooth_prevs_hat = smooth(prevs_hat, eps) return (smooth_prevs*np.log(smooth_prevs/smooth_prevs_hat)).sum(axis=-1) -def mnkld(prevs, prevs_hat, eps=None): +def mnkld(prevs_true, prevs_hat, eps=None): """Computes the mean Normalized Kullback-Leibler divergence (see :meth:`quapy.error.nkld`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain @@ -177,10 +183,10 @@ def mnkld(prevs, prevs_hat, eps=None): (which has thus to be set beforehand). :return: mean Normalized Kullback-Leibler distribution """ - return nkld(prevs, prevs_hat, eps).mean() + return nkld(prevs_true, prevs_hat, eps).mean() -def nkld(prevs, prevs_hat, eps=None): +def nkld(prevs_true, prevs_hat, eps=None): """Computes the Normalized Kullback-Leibler divergence between the two prevalence distributions. Normalized Kullback-Leibler divergence between two prevalence distributions :math:`p` and :math:`\\hat{p}` is computed as @@ -189,7 +195,7 @@ def nkld(prevs, prevs_hat, eps=None): :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. NKLD is not defined in cases in which the distributions contain zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the sample @@ -197,16 +203,16 @@ def nkld(prevs, prevs_hat, eps=None): `SAMPLE_SIZE` (which has thus to be set beforehand). :return: Normalized Kullback-Leibler divergence between the two distributions """ - ekld = np.exp(kld(prevs, prevs_hat, eps)) + ekld = np.exp(kld(prevs_true, prevs_hat, eps)) return 2. * ekld / (1 + ekld) - 1. -def mrae(prevs, prevs_hat, eps=None): +def mrae(prevs_true, prevs_hat, eps=None): """Computes the mean relative absolute error (see :meth:`quapy.error.rae`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). 
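As a quick usage sketch (values are illustrative only), the smoothing factor can either be passed explicitly or derived from `qp.environ['SAMPLE_SIZE']`:

>>> import numpy as np
>>> import quapy as qp
>>> true_prevs = np.array([[0.2, 0.8], [0.5, 0.5]])
>>> estim_prevs = np.array([[0.3, 0.7], [0.4, 0.6]])
>>> qp.error.mae(true_prevs, estim_prevs)              # no smoothing needed
>>> qp.error.mrae(true_prevs, estim_prevs, eps=0.01)   # explicit smoothing
>>> qp.environ['SAMPLE_SIZE'] = 100                    # or let eps default to 1/(2*SAMPLE_SIZE)
>>> qp.error.mrae(true_prevs, estim_prevs)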
- :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values @@ -216,10 +222,10 @@ def mrae(prevs, prevs_hat, eps=None): the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand). :return: mean relative absolute error """ - return rae(prevs, prevs_hat, eps).mean() + return rae(prevs_true, prevs_hat, eps).mean() -def rae(prevs, prevs_hat, eps=None): +def rae(prevs_true, prevs_hat, eps=None): """Computes the absolute relative error between the two prevalence vectors. Relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as @@ -228,7 +234,7 @@ def rae(prevs, prevs_hat, eps=None): where :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. `rae` is not defined in cases in which the true distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the @@ -237,12 +243,12 @@ def rae(prevs, prevs_hat, eps=None): :return: relative absolute error """ eps = __check_eps(eps) - prevs = smooth(prevs, eps) + prevs_true = smooth(prevs_true, eps) prevs_hat = smooth(prevs_hat, eps) - return (abs(prevs - prevs_hat) / prevs).mean(axis=-1) + return (abs(prevs_true - prevs_hat) / prevs_true).mean(axis=-1) -def nrae(prevs, prevs_hat, eps=None): +def nrae(prevs_true, prevs_hat, eps=None): """Computes the normalized absolute relative error between the two prevalence vectors. Relative absolute error between two prevalence vectors :math:`p` and :math:`\\hat{p}` is computed as @@ -252,7 +258,7 @@ def nrae(prevs, prevs_hat, eps=None): and :math:`\\mathcal{Y}` are the classes of interest. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :param eps: smoothing factor. `nrae` is not defined in cases in which the true distribution contains zeros; `eps` is typically set to be :math:`\\frac{1}{2T}`, with :math:`T` the @@ -261,18 +267,18 @@ def nrae(prevs, prevs_hat, eps=None): :return: normalized relative absolute error """ eps = __check_eps(eps) - prevs = smooth(prevs, eps) + prevs_true = smooth(prevs_true, eps) prevs_hat = smooth(prevs_hat, eps) - min_p = prevs.min(axis=-1) - return (abs(prevs - prevs_hat) / prevs).sum(axis=-1)/(prevs.shape[-1]-1+(1-min_p)/min_p) + min_p = prevs_true.min(axis=-1) + return (abs(prevs_true - prevs_hat) / prevs_true).sum(axis=-1)/(prevs_true.shape[-1] - 1 + (1 - min_p) / min_p) -def mnrae(prevs, prevs_hat, eps=None): +def mnrae(prevs_true, prevs_hat, eps=None): """Computes the mean normalized relative absolute error (see :meth:`quapy.error.nrae`) across the sample pairs. The distributions are smoothed using the `eps` factor (see :meth:`quapy.error.smooth`). 
- :param prevs: array-like of shape `(n_samples, n_classes,)` with the true + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values @@ -282,57 +288,61 @@ def mnrae(prevs, prevs_hat, eps=None): the environment variable `SAMPLE_SIZE` (which has thus to be set beforehand). :return: mean normalized relative absolute error """ - return nrae(prevs, prevs_hat, eps).mean() + return nrae(prevs_true, prevs_hat, eps).mean() -def nmd(prevs, prevs_hat): +def nmd(prevs_true, prevs_hat): """ Computes the Normalized Match Distance; which is the Normalized Distance multiplied by the factor `1/(n-1)` to guarantee the measure ranges between 0 (best prediction) and 1 (worst prediction). - :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values :return: float in [0,1] """ - n = prevs.shape[-1] - return (1./(n-1))*np.mean(match_distance(prevs, prevs_hat)) + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + n = prevs_true.shape[-1] + return (1./(n-1))*np.mean(match_distance(prevs_true, prevs_hat)) -def bias_binary(prevs, prevs_hat): +def bias_binary(prevs_true, prevs_hat): """ Computes the (positive) bias in a binary problem. The bias is simply the difference between the predicted positive value and the true positive value, so that a positive such value indicates the prediction has positive bias (i.e., it tends to overestimate) the true value, and negative otherwise. :math:`bias(p,\\hat{p})=\\hat{p}_1-p_1`, - :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_samples, n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values :return: binary bias """ - assert prevs.shape[-1] == 2 and prevs.shape[-1] == 2, f'bias_binary can only be applied to binary problems' - return prevs_hat[...,1]-prevs[...,1] + prevs_true = np.asarray(prevs_true) + prevs_hat = np.asarray(prevs_hat) + assert prevs_true.shape[-1] == 2 and prevs_true.shape[-1] == 2, f'bias_binary can only be applied to binary problems' + return prevs_hat[...,1]-prevs_true[...,1] -def mean_bias_binary(prevs, prevs_hat): +def mean_bias_binary(prevs_true, prevs_hat): """ Computes the mean of the (positive) bias in a binary problem. - :param prevs: array-like of shape `(n_classes,)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` with the predicted prevalence values :return: mean binary bias """ - return np.mean(bias_binary(prevs, prevs_hat)) + return np.mean(bias_binary(prevs_true, prevs_hat)) -def md(prevs, prevs_hat, ERROR_TOL=1E-3): +def md(prevs_true, prevs_hat, ERROR_TOL=1E-3): """ Computes the Match Distance, under the assumption that the cost in mistaking class i with class i+1 is 1 in all cases. 
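For intuition, a small hand-worked example (under the usual reading of MD as the L1 distance between the cumulative distributions): for :math:`p=(0.2, 0.3, 0.5)` and :math:`\\hat{p}=(0.3, 0.3, 0.4)`, the cumulative distributions are :math:`(0.2, 0.5, 1.0)` and :math:`(0.3, 0.6, 1.0)`, hence :math:`MD=|0.2-0.3|+|0.5-0.6|+|1.0-1.0|=0.2`, and the normalized variant is :math:`NMD=\\frac{1}{n-1}MD=0.1` for :math:`n=3` classes.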
- :param prevs: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values + :param prevs_true: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the true prevalence values :param prevs_hat: array-like of shape `(n_classes,)` or `(n_instances, n_classes)` with the predicted prevalence values :return: float """ - P = np.cumsum(prevs, axis=-1) + P = np.cumsum(prevs_true, axis=-1) P_hat = np.cumsum(prevs_hat, axis=-1) assert np.all(np.isclose(P_hat[..., -1], 1.0, rtol=ERROR_TOL)), \ 'arg error in match_distance: the array does not represent a valid distribution' @@ -349,6 +359,7 @@ def smooth(prevs, eps): :param eps: smoothing factor :return: array-like of shape `(n_classes,)` with the smoothed distribution """ + prevs = np.asarray(prevs) n_classes = prevs.shape[-1] return (prevs + eps) / (eps * n_classes + 1) diff --git a/quapy/functional.py b/quapy/functional.py index 2e477e0..408c62a 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -439,7 +439,7 @@ def argmin_prevalence(loss: Callable, raise NotImplementedError() -def optim_minimize(loss: Callable, n_classes: int): +def optim_minimize(loss: Callable, n_classes: int, return_loss=False): """ Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex that yields the smallest lost. This optimization is carried out by means of a constrained search using scipy's @@ -447,18 +447,24 @@ def optim_minimize(loss: Callable, n_classes: int): :param loss: (callable) the function to minimize :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector - :return: (ndarray) the best prevalence vector found + :param return_loss: bool, if True, returns also the value of the loss (default is False). + :return: (ndarray) the best prevalence vector found or a tuple which also contains the value of the loss + if return_loss=True """ from scipy import optimize # the initial point is set as the uniform distribution - uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + uniform_distribution = uniform_prevalence(n_classes=n_classes) # solutions are bounded to those contained in the unit-simplex bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) - return r.x + + if return_loss: + return r.x, r.fun + else: + return r.x def linear_search(loss: Callable, n_classes: int): diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py index f352ca5..4c2ec1c 100644 --- a/quapy/method/__init__.py +++ b/quapy/method/__init__.py @@ -1,3 +1,7 @@ +import warnings +from sklearn.exceptions import ConvergenceWarning +warnings.simplefilter("ignore", ConvergenceWarning) + from . import confidence from . import base from . 
import aggregative @@ -23,7 +27,8 @@ AGGREGATIVE_METHODS = { aggregative.KDEyML, aggregative.KDEyCS, aggregative.KDEyHD, - confidence.BayesianCC + # aggregative.OneVsAllAggregative, + confidence.BayesianCC, } BINARY_METHODS = { @@ -63,3 +68,5 @@ QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS | META_ME + + diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py index f941e30..5f80f8d 100644 --- a/quapy/method/_kdey.py +++ b/quapy/method/_kdey.py @@ -1,10 +1,8 @@ -from typing import Union import numpy as np from sklearn.base import BaseEstimator from sklearn.neighbors import KernelDensity import quapy as qp -from quapy.data import LabelledCollection from quapy.method.aggregative import AggregativeSoftQuantifier import quapy.functional as F diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 9b85650..25fc1ef 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from argparse import ArgumentError from copy import deepcopy from typing import Callable, Literal, Union import numpy as np @@ -19,6 +20,10 @@ from quapy.data import LabelledCollection from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric from quapy.method import _bayesian +# import warnings +# from sklearn.exceptions import ConvergenceWarning +# warnings.filterwarnings("ignore", category=ConvergenceWarning) + # Abstract classes # ------------------------------------ @@ -51,7 +56,11 @@ class AggregativeQuantifier(BaseQuantifier, ABC): the training data be wasted. """ - def __init__(self, classifier: Union[None,BaseEstimator], fit_classifier:bool=True, val_split:Union[int,float,tuple,None]=5): + def __init__(self, + classifier: Union[None,BaseEstimator], + fit_classifier:bool=True, + val_split:Union[int,float,tuple,None]=5): + self.classifier = qp._get_classifier(classifier) self.fit_classifier = fit_classifier self.val_split = val_split @@ -63,6 +72,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC): assert isinstance(fit_classifier, bool), \ f'unexpected type for {fit_classifier=}; must be True or False' + # val_split is indicated as a number of folds for cross-validation if isinstance(val_split, int): assert val_split > 1, \ (f'when {val_split=} is indicated as an integer, it represents the number of folds in a kFCV ' @@ -75,12 +85,14 @@ class AggregativeQuantifier(BaseQuantifier, ABC): if val_split!=5: assert fit_classifier, (f'Parameter {val_split=} has been modified, but {fit_classifier=} ' f'indicates the classifier should not be retrained.') + # val_split is indicated as a fraction of validation instances elif isinstance(val_split, float): assert 0 < val_split < 1, \ (f'when {val_split=} is indicated as a float, it represents the fraction of training instances ' f'to be used for validation, and must thus be in the range (0,1)') assert fit_classifier, (f'when {val_split=} is indicated as a float (the fraction of training instances ' f'to be used for validation), the parameter {fit_classifier=} must be True') + # val_split is indicated as a validation collection (X,y) elif isinstance(val_split, tuple): assert len(val_split) == 2, \ (f'when {val_split=} is indicated as a tuple, it represents the collection (X,y) on which the ' @@ -161,7 +173,9 @@ class AggregativeQuantifier(BaseQuantifier, ABC): assert self.fit_classifier, f'{self.__class__}: unexpected value for {self.fit_classifier=}' num_folds = self.val_split n_jobs = self.n_jobs if hasattr(self, 'n_jobs') else 
qp._get_njobs(None) - predictions = cross_val_predict(self.classifier, X, y, cv=num_folds, n_jobs=n_jobs, method=self._classifier_method()) + predictions = cross_val_predict( + self.classifier, X, y, cv=num_folds, n_jobs=n_jobs, method=self._classifier_method() + ) labels = y self.classifier.fit(X, y) elif isinstance(self.val_split, float): @@ -672,26 +686,26 @@ class EMQ(AggregativeSoftQuantifier): :param classifier: a scikit-learn's BaseEstimator, or None, in which case the classifier is taken to be the one indicated in `qp.environ['DEFAULT_CLS']` - :param fit_classifier: whether to train the learner (default is True). Set to False if the - learner has been trained outside the quantifier. + :param fit_classifier: whether to train the classifier (default is True). Set to False if the + given classifier has already been trained. - :param val_split: specifies the data used for generating classifier predictions. This specification - can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to - be extracted from the training set; or as an integer (default 5), indicating that the predictions - are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value - for `k`); or as a tuple (X,y) defining the specific set of data to use for validation. - This hyperparameter is only meant to be used when the heuristics are to be applied, i.e., if a - calibration is required. The default value is None (meaning the calibration is not required). In - case this hyperparameter is set to a value other than None, but the calibration is not required - (calib=None), a warning message will be raised. + :param val_split: specifies the data used for generating the classifier predictions on which the + aggregation function is to be trained. This specification can be made as float in (0, 1) indicating + the proportion of stratified held-out validation set to be extracted from the training set; or as + an integer (default 5), indicating that the predictions are to be generated in a `k`-fold + cross-validation manner (with this integer indicating the value for `k`); or as a tuple (X,y) defining + the specific set of data to use for validation. This hyperparameter is only meant to be used when + the heuristics are to be applied, i.e., if a calibration is required. The default value is None + (meaning the calibration is not required). In case this hyperparameter is set to a value other than + None, but the calibration is not required (calib=None), a warning message will be raised. - :param exact_train_prev: set to True (default) for using the true training prevalence as the initial observation; - set to False for computing the training prevalence as an estimate of it, i.e., as the expected - value of the posterior probabilities of the training instances. + :param exact_train_prev: set to True (default) for using the true training prevalence as the initial + observation; set to False for computing the training prevalence as an estimate of it, i.e., as the + expected value of the posterior probabilities of the training instances. :param calib: a string indicating the method of calibration. - Available choices include "nbvs" (No-Bias Vector Scaling), "bcts" (Bias-Corrected Temperature Scaling, - default), "ts" (Temperature Scaling), and "vs" (Vector Scaling). Default is None (no calibration). + Available choices include "nbvs" (No-Bias Vector Scaling), "bcts" (Bias-Corrected Temperature Scaling), + "ts" (Temperature Scaling), and "vs" (Vector Scaling). 
Default is None (no calibration). :param on_calib_error: a string indicating the policy to follow in case the calibrator fails at runtime. Options include "raise" (default), in which case a RuntimeException is raised; and "backup", in which @@ -756,8 +770,8 @@ class EMQ(AggregativeSoftQuantifier): if self.val_split is not None: if self.exact_train_prev and self.calib is None: raise RuntimeWarning(f'The parameter {self.val_split=} was specified for EMQ, while the parameters ' - f'{self.exact_train_prev=} and {self.calib=}. This has no effect and causes an unnecessary ' - f'overload.') + f'{self.exact_train_prev=} and {self.calib=}. This has no effect and causes an ' + f'unnecessary overload.') else: if self.calib is not None: print(f'[warning] The parameter {self.calib=} requires the val_split be different from None. ' @@ -784,8 +798,6 @@ class EMQ(AggregativeSoftQuantifier): def _fit_calibration(self, calibrator, P, y): n_classes = len(self.classes_) - print(y, 'Y') - print(y.dtype, 'DTYPE') if not np.issubdtype(y.dtype, np.number): y = np.searchsorted(self.classes_, y) @@ -823,6 +835,19 @@ class EMQ(AggregativeSoftQuantifier): """ P = classif_predictions y = labels + + requires_predictions = (self.calib is not None) or (not self.exact_train_prev) + if P is None and requires_predictions: + # classifier predictions were not generated because val_split=None + raise ArgumentError(self.val_split, self.__class__.__name__ + + ": Classifier predictions for the aggregative fit were not generated because " "val_split=None. This usually happens when you enable calibrations or heuristics " "during model selection but left val_split set to its default value (None). " "Please provide one of the following values for val_split: (i) an integer >1 " "(e.g. val_split=5) for k-fold cross-validation; (ii) a float in (0,1) (e.g. " "val_split=0.3) for a proportion split; or (iii) a tuple (X, y) with explicit " "validation data") + if self.calib is not None: calibrator = { 'nbvs': NoBiasVectorScaling(), @@ -1381,18 +1406,20 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier): `Gao and Sebastiani, 2016 `_. :param binary_quantifier: a quantifier (binary) that will be employed to work on a multiclass model in a - one-vs-all manner + one-vs-all manner (default PACC(LogisticRegression())) :param n_jobs: number of parallel workers :param parallel_backend: the parallel backend for joblib (default "loky"); this is helpful for some quantifiers (e.g., ELM-based ones) that cannot be run with multiprocessing, since the temp dir they create during fit is removed and no longer available at predict time.
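A minimal construction sketch (illustrative only; fitting and prediction then follow the usual aggregative quantifier API):

>>> from quapy.method.aggregative import OneVsAllAggregative, PACC
>>> ova = OneVsAllAggregative()        # relies on the default binary quantifier, PACC()
>>> ova = OneVsAllAggregative(PACC())  # or pass the binary quantifier explicitly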
""" - def __init__(self, binary_quantifier, n_jobs=None, parallel_backend='multiprocessing'): + def __init__(self, binary_quantifier=None, n_jobs=None, parallel_backend='multiprocessing'): + if binary_quantifier is None: + binary_quantifier = PACC() assert isinstance(binary_quantifier, BaseQuantifier), \ - f'{self.binary_quantifier} does not seem to be a Quantifier' + f'{binary_quantifier} does not seem to be a Quantifier' assert isinstance(binary_quantifier, AggregativeQuantifier), \ - f'{self.binary_quantifier} does not seem to be of type Aggregative' + f'{binary_quantifier} does not seem to be of type Aggregative' self.binary_quantifier = binary_quantifier self.n_jobs = qp._get_njobs(n_jobs) self.parallel_backend = parallel_backend diff --git a/quapy/method/composable.py b/quapy/method/composable.py index 3aacab6..c40e3bb 100644 --- a/quapy/method/composable.py +++ b/quapy/method/composable.py @@ -1,27 +1,28 @@ """This module allows the composition of quantification methods from loss functions and feature transformations. This functionality is realized through an integration of the qunfold package: https://github.com/mirkobunse/qunfold.""" -__install_istructions = """ +from dataclasses import dataclass +from packaging.version import Version + +from .base import BaseQuantifier + +# what to display when an ImportError is thrown +_IMPORT_ERROR_MESSAGE = """qunfold, the back-end of quapy.method.composable, is not properly installed. + To fix this error, call: pip install --upgrade pip setuptools wheel pip install "jax[cpu]" pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.5" """ -__import_error_message = ( - "qunfold, the back-end of quapy.method.composable, is not properly installed." + __install_istructions -) -__old_version_message = ( - "The version of qunfold you have installed is not compatible with current quapy's version, " - "which requires qunfold>=0.1.5. 
" + __install_istructions -) - -from packaging.version import Version +# try to import members of qunfold as members of this module try: import qunfold - from qunfold.quapy import QuaPyWrapper + from qunfold.base import BaseMixin + from qunfold.methods import AbstractMethod from qunfold.sklearn import CVClassifier from qunfold import ( + LinearMethod, # methods LeastSquaresLoss, # losses BlobelLoss, EnergyLoss, @@ -29,37 +30,38 @@ try: CombinedLoss, TikhonovRegularization, TikhonovRegularized, - ClassTransformer, # transformers - HistogramTransformer, - DistanceTransformer, - KernelTransformer, - EnergyKernelTransformer, - LaplacianKernelTransformer, - GaussianKernelTransformer, - GaussianRFFKernelTransformer, + ClassRepresentation, # representations + HistogramRepresentation, + DistanceRepresentation, + KernelRepresentation, + EnergyKernelRepresentation, + LaplacianKernelRepresentation, + GaussianKernelRepresentation, + GaussianRFFKernelRepresentation, ) - - __all__ = [ # control public members, e.g., for auto-documentation in sphinx; omit QuaPyWrapper - "ComposableQuantifier", - "CVClassifier", - "LeastSquaresLoss", - "BlobelLoss", - "EnergyLoss", - "HellingerSurrogateLoss", - "CombinedLoss", - "TikhonovRegularization", - "TikhonovRegularized", - "ClassTransformer", - "HistogramTransformer", - "DistanceTransformer", - "KernelTransformer", - "EnergyKernelTransformer", - "LaplacianKernelTransformer", - "GaussianKernelTransformer", - "GaussianRFFKernelTransformer", - ] except ImportError as e: - raise ImportError(__import_error_message) from e + raise ImportError(_IMPORT_ERROR_MESSAGE) from e + +__all__ = [ # control public members, e.g., for auto-documentation in sphinx + "QUnfoldWrapper", + "ComposableQuantifier", + "CVClassifier", + "LeastSquaresLoss", + "BlobelLoss", + "EnergyLoss", + "HellingerSurrogateLoss", + "CombinedLoss", + "TikhonovRegularization", + "TikhonovRegularized", + "ClassRepresentation", + "HistogramRepresentation", + "DistanceRepresentation", + "KernelRepresentation", + "EnergyKernelRepresentation", + "LaplacianKernelRepresentation", + "GaussianKernelRepresentation", + "GaussianRFFKernelRepresentation", +] def check_compatible_qunfold_version(): @@ -69,18 +71,54 @@ def check_compatible_qunfold_version(): # versions of qunfold <= 0.1.4 did not declare __version__ in the __init__.py but only in the setup.py version_str = "0.1.4" - compatible = Version(version_str) >= Version("0.1.5") + installed_ver = Version(version_str) + required_ver = Version("0.1.5") + compatible = installed_ver.base_version == required_ver.base_version or installed_ver>=required_ver return compatible -def ComposableQuantifier(loss, transformer, **kwargs): +@dataclass +class QUnfoldWrapper(BaseQuantifier,BaseMixin): + """A thin wrapper for using qunfold methods in QuaPy. + + Args: + _method: An instance of `qunfold.methods.AbstractMethod` to wrap. + + Examples: + Here, we wrap an instance of ACC to perform a grid search with QuaPy. + + >>> from qunfold import ACC + >>> qunfold_method = QUnfoldWrapper(ACC(RandomForestClassifier(obb_score=True))) + >>> quapy.model_selection.GridSearchQ( + >>> model = qunfold_method, + >>> param_grid = { # try both splitting criteria + >>> "representation__classifier__estimator__criterion": ["gini", "entropy"], + >>> }, + >>> # ... 
+ >>> ) + """ + _method: AbstractMethod + def fit(self, X, y): # data is a qp.LabelledCollection + self._method.fit(X, y) + return self + def predict(self, X): + return self._method.predict(X) + def set_params(self, **params): + self._method.set_params(**params) + return self + def get_params(self, deep=True): + return self._method.get_params(deep) + def __str__(self): + return self._method.__str__() + +def ComposableQuantifier(loss, representation, **kwargs): """A generic quantification / unfolding method that solves a linear system of equations. This class represents any quantifier that can be described in terms of a loss function, a feature transformation, and a regularization term. In this implementation, the loss is minimized through unconstrained second-order minimization. Valid probability estimates are ensured through a soft-max trick by Bunse (2022). Args: loss: An instance of a loss class from `quapy.methods.composable`. - transformer: An instance of a transformer class from `quapy.methods.composable`. + representation: An instance of a representation class from `quapy.methods.composable`. solver (optional): The `method` argument in `scipy.optimize.minimize`. Defaults to `"trust-ncg"`. solver_options (optional): The `options` argument in `scipy.optimize.minimize`. Defaults to `{"gtol": 1e-8, "maxiter": 1000}`. seed (optional): A random number generator seed from which a numpy RandomState is created. Defaults to `None`. @@ -92,12 +130,12 @@ def ComposableQuantifier(loss, transformer, **kwargs): >>> ComposableQuantifier, >>> TikhonovRegularized, >>> LeastSquaresLoss, - >>> ClassTransformer, + >>> ClassRepresentation, >>> ) >>> from sklearn.ensemble import RandomForestClassifier >>> o_acc = ComposableQuantifier( >>> TikhonovRegularized(LeastSquaresLoss(), 0.01), - >>> ClassTransformer(RandomForestClassifier(oob_score=True)) + >>> ClassRepresentation(RandomForestClassifier(oob_score=True)) >>> ) Here, we perform hyper-parameter optimization with the ordinal ACC. @@ -105,21 +143,18 @@ def ComposableQuantifier(loss, transformer, **kwargs): >>> quapy.model_selection.GridSearchQ( >>> model = o_acc, >>> param_grid = { # try both splitting criteria - >>> "transformer__classifier__estimator__criterion": ["gini", "entropy"], + >>> "representation__classifier__estimator__criterion": ["gini", "entropy"], >>> }, >>> # ... >>> ) - + To use a classifier that does not provide the `oob_score` argument, such as logistic regression, you have to configure a cross validation of this classifier. Here, we employ 10 cross validation folds. 5 folds are the default. 
>>> from quapy.method.composable import CVClassifier >>> from sklearn.linear_model import LogisticRegression >>> acc_lr = ComposableQuantifier( >>> LeastSquaresLoss(), - >>> ClassTransformer(CVClassifier(LogisticRegression(), 10)) + >>> ClassRepresentation(CVClassifier(LogisticRegression(), 10)) >>> ) """ - if not check_compatible_qunfold_version(): - raise ImportError(__old_version_message) - - return QuaPyWrapper(qunfold.GenericMethod(loss, transformer, **kwargs)) + return QUnfoldWrapper(LinearMethod(loss, representation, **kwargs)) \ No newline at end of file diff --git a/quapy/method/confidence.py b/quapy/method/confidence.py index f54768c..07b2b1e 100644 --- a/quapy/method/confidence.py +++ b/quapy/method/confidence.py @@ -88,18 +88,30 @@ class WithConfidenceABC(ABC): METHODS = ['intervals', 'ellipse', 'ellipse-clr'] @abstractmethod - def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): + def predict_conf(self, instances, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC): """ - Adds the method `quantify_conf` to the interface. This method returns not only the point-estimate, but + Adds the method `predict_conf` to the interface. This method returns not only the point-estimate, but also the confidence region around it. :param instances: a np.ndarray of shape (n_instances, n_features,) - :confidence_level: float in (0, 1) + :param confidence_level: float in (0, 1), default is 0.95 :return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape (n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC` """ ... + def quantify_conf(self, instances, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC): + """ + Alias to `predict_conf`. This method returns not only the point-estimate, but + also the confidence region around it. + + :param instances: a np.ndarray of shape (n_instances, n_features,) + :param confidence_level: float in (0, 1), default is 0.95 + :return: a tuple (`point_estimate`, `conf_region`), where `point_estimate` is a np.ndarray of shape + (n_classes,) and `conf_region` is an object from :class:`ConfidenceRegionABC` + """ + return self.predict_conf(instances=instances, confidence_level=confidence_level) + @classmethod def construct_region(cls, prev_estims, confidence_level=0.95, method='intervals'): """ @@ -227,6 +239,7 @@ class ConfidenceEllipseCLR(ConfidenceRegionABC): """ def __init__(self, X, confidence_level=0.95): + X = np.asarray(X) self.clr = CLRtransformation() Z = self.clr(X) self.mean_ = np.mean(X, axis=0) @@ -297,6 +310,9 @@ class ConfidenceIntervals(ConfidenceRegionABC): return proportion + def __repr__(self): + return '['+', '.join(f'({low:.4f}, {high:.4f})' for (low,high) in zip(self.I_low, self.I_high))+']' + class CLRtransformation: """ @@ -339,6 +355,12 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): During inference, the bootstrap repetitions are applied to the pre-classified test instances. + See + `Moreo, A., Salvati, N. + An Efficient Method for Deriving Confidence Intervals in Aggregative Quantification. + Learning to Quantify: Methods and Applications (LQ 2025), co-located at ECML-PKDD 2025. 
+ pp 12-33 `_ + :param quantifier: an aggregative quantifier :para n_train_samples: int, the number of training resamplings (defaults to 1, set to > 1 to activate a model-based bootstrap approach) @@ -423,7 +445,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): self.aggregation_fit(classif_predictions, labels) return self - def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): + def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): predictions = self.quantifier.classify(instances) return self.aggregate_conf(predictions, confidence_level=confidence_level) @@ -437,7 +459,7 @@ class AggregativeBootstrap(WithConfidenceABC, AggregativeQuantifier): class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC): """ - `Bayesian quantification `_ method, + `Bayesian quantification `_ method (by Albert Ziegler and Paweł Czyż), which is a variant of :class:`ACC` that calculates the posterior probability distribution over the prevalence vectors, rather than providing a point estimate obtained by matrix inversion. @@ -543,7 +565,7 @@ class BayesianCC(AggregativeCrispQuantifier, WithConfidenceABC): samples = self.sample_from_posterior(classif_predictions)[_bayesian.P_TEST_Y] return np.asarray(samples.mean(axis=0), dtype=float) - def quantify_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): + def predict_conf(self, instances, confidence_level=None) -> (np.ndarray, ConfidenceRegionABC): classif_predictions = self.classify(instances) point_estimate = self.aggregate(classif_predictions) samples = self.get_prevalence_samples() # available after calling "aggregate" function diff --git a/quapy/method/meta.py b/quapy/method/meta.py index 17c9903..37749e1 100644 --- a/quapy/method/meta.py +++ b/quapy/method/meta.py @@ -652,14 +652,26 @@ def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs): return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs) +def merge(prev_predictions, merge_fun): + prev_predictions = np.asarray(prev_predictions) + if merge_fun == 'median': + prevalences = np.median(prev_predictions, axis=0) + prevalences = F.normalize_prevalence(prevalences, method='l1') + elif merge_fun == 'mean': + prevalences = np.mean(prev_predictions, axis=0) + else: + raise NotImplementedError(f'merge function {merge_fun} not implemented!') + return prevalences + + class SCMQ(AggregativeSoftQuantifier): - MERGE_FUNCTIONS = ['median'] + MERGE_FUNCTIONS = ['median', 'mean'] def __init__(self, classifier, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5): self.classifier = classifier - self.quantifiers = quantifiers - assert merge_fun in self.MERGE_FUNCTIONS, f'unknwon {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}' + self.quantifiers = [deepcopy(q) for q in quantifiers] + assert merge_fun in self.MERGE_FUNCTIONS, f'unknown {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}' self.merge_fun = merge_fun self.val_split = val_split @@ -674,22 +686,51 @@ class SCMQ(AggregativeSoftQuantifier): for quantifier_i in self.quantifiers: prevalence_i = quantifier_i.aggregate(classif_predictions) prev_predictions.append(prevalence_i) - return self.merge(prev_predictions) - - def merge(self, prev_predictions): - prev_predictions = np.asarray(prev_predictions) - if self.merge_fun == 'median': - prevalences = np.median(prev_predictions, axis=0) - prevalences = F.normalize_prevalence(prevalences, method='l1') 
- elif self.merge_fun == 'mean': - prevalences = np.mean(prev_predictions, axis=0) - else: - raise NotImplementedError(f'merge function {self.merge_fun} not implemented!') - return prevalences - - + return merge(prev_predictions, merge_fun=self.merge_fun) +class MCSQ(BaseQuantifier): + def __init__(self, classifiers, quantifier: AggregativeSoftQuantifier, merge_fun='median', val_split=5): + self.merge_fun = merge_fun + self.val_split = val_split + self.mcsqs = [] + for classifier in classifiers: + quantifier = deepcopy(quantifier) + quantifier.classifier = classifier + self.mcsqs.append(quantifier) + + def fit(self, data: LabelledCollection): + for q in self.mcsqs: + q.fit(data, val_split=self.val_split) + return self + + def quantify(self, instances): + prev_predictions = [] + for q in self.mcsqs: + prevalence_i = q.quantify(instances) + prev_predictions.append(prevalence_i) + return merge(prev_predictions, merge_fun=self.merge_fun) + + +class MCMQ(BaseQuantifier): + def __init__(self, classifiers, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5): + self.merge_fun = merge_fun + self.scmqs = [] + for classifier in classifiers: + self.scmqs.append(SCMQ(classifier, quantifiers, val_split=val_split)) + + def fit(self, data: LabelledCollection): + for q in self.scmqs: + q.fit(data) + return self + + def quantify(self, instances): + prev_predictions = [] + for q in self.scmqs: + prevalence_i = q.quantify(instances) + prev_predictions.append(prevalence_i) + return merge(prev_predictions, merge_fun=self.merge_fun) + diff --git a/quapy/method/non_aggregative.py b/quapy/method/non_aggregative.py index eff2283..ae894fd 100644 --- a/quapy/method/non_aggregative.py +++ b/quapy/method/non_aggregative.py @@ -1,11 +1,17 @@ -from typing import Union, Callable +from itertools import product +from tqdm import tqdm +from typing import Union, Callable, Counter import numpy as np from sklearn.feature_extraction.text import CountVectorizer +from sklearn.utils import resample +from sklearn.preprocessing import normalize +from quapy.method.confidence import WithConfidenceABC, ConfidenceRegionABC from quapy.functional import get_divergence -from quapy.data import LabelledCollection from quapy.method.base import BaseQuantifier, BinaryQuantifier import quapy.functional as F +from scipy.optimize import lsq_linear +from scipy import sparse class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier): @@ -149,53 +155,164 @@ class DMx(BaseQuantifier): return F.argmin_prevalence(loss, n_classes, method=self.search) -# class ReadMe(BaseQuantifier): -# -# def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs): -# raise NotImplementedError('under development ...') -# self.bootstrap_trials = bootstrap_trials -# self.bootstrap_range = bootstrap_range -# self.bagging_trials = bagging_trials -# self.bagging_range = bagging_range -# self.vectorizer_kwargs = vectorizer_kwargs -# -# def fit(self, data: LabelledCollection): -# X, y = data.Xy -# self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs) -# X = self.vectorizer.fit_transform(X) -# self.class_conditional_X = {i: X[y==i] for i in range(data.classes_)} -# -# def predict(self, X): -# X = self.vectorizer.transform(X) -# -# # number of features -# num_docs, num_feats = X.shape -# -# # bootstrap -# p_boots = [] -# for _ in range(self.bootstrap_trials): -# docs_idx = np.random.choice(num_docs, size=self.bootstra_range, replace=False) -# class_conditional_X = {i: X[docs_idx] 
for i, X in self.class_conditional_X.items()} -# Xboot = X[docs_idx] -# -# # bagging -# p_bags = [] -# for _ in range(self.bagging_trials): -# feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False) -# class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()} -# Xbag = Xboot[:,feat_idx] -# p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag) -# p_bags.append(p) -# p_boots.append(np.mean(p_bags, axis=0)) -# -# p_mean = np.mean(p_boots, axis=0) -# p_std = np.std(p_bags, axis=0) -# -# return p_mean -# -# -# def std_constrained_linear_ls(self, X, class_cond_X: dict): -# pass + + +class ReadMe(BaseQuantifier, WithConfidenceABC): + """ + ReadMe is a non-aggregative quantification system proposed by + `Daniel Hopkins and Gary King, 2007. A method of automated nonparametric content analysis for + social science. American Journal of Political Science, 54(1):229–247. + `_. + The idea is to estimate `Q(Y=i)` directly from: + + :math:`Q(X)=\\sum_{i=1}^{n} Q(X|Y=i) Q(Y=i)` + + via least-squares regression, i.e., without incurring the cost of computing posterior probabilities. + However, this poses a very difficult estimation problem, in which the vector `Q(X)` and the matrix `Q(X|Y=i)` + can be of very high dimensionality. In order to render the problem tractable, ReadMe performs bagging in + the feature space. ReadMe also combines bagging with bootstrap in order to derive confidence intervals + around point estimates. + + We use the same default parameters as in the official + `R implementation `_. + + :param prob_model: str ('naive', or 'full'), selects the modality in which the probabilities `Q(X)` and + `Q(X|Y)` are to be modelled. Options include "full", which corresponds to the original formulation of + ReadMe, in which X is constrained to be a binary matrix (e.g., of term presence/absence) and in which + `Q(X)` and `Q(X|Y)` are modelled, respectively, as matrices of `(2^K, 1)` and `(2^K, n)` values, where + `K` is the number of columns in the data matrix (i.e., `bagging_range`), and `n` is the number of classes. + Of course, this approach is computationally prohibitive for large `K`, so the computation is restricted to data + matrices with `K<=25` (although we recommend even smaller values of `K`). A much faster model is "naive", which + considers `Q(X)` and `Q(X|Y)` to be multinomial distributions under the `bag-of-words` perspective. In this + case, `bagging_range` can be set to much larger values. Default is "full" (i.e., original ReadMe behavior). 
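For intuition, a toy sketch of the two representations on a small binary matrix (hand-written; it only mirrors the spirit of the internal helpers):

>>> import numpy as np
>>> X = np.array([[1, 0], [1, 1], [0, 1]])
>>> # 'naive': a multinomial over the K features (column counts, L1-normalized)
>>> X.sum(axis=0) / X.sum()                   # -> array([0.5, 0.5])
>>> # 'full': an empirical distribution over the 2^K possible binary rows
>>> codes = X @ np.array([2, 1])              # encode each row as an integer in [0, 2^K)
>>> np.bincount(codes, minlength=4) / len(X)  # -> array([0., 0.333..., 0.333..., 0.333...])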
+ :param bootstrap_trials: int, number of bootstrap trials (default 300) + :param bagging_trials: int, number of bagging trials (default 300) + :param bagging_range: int, number of features to keep for each bagging trial (default 15) + :param confidence_level: float, a value in (0,1) reflecting the desired confidence level (default 0.95) + :param region: str in 'intervals', 'ellipse', 'ellipse-clr'; indicates the preferred method for + defining the confidence region (see :class:`WithConfidenceABC`) + :param random_state: int or None, allows replicability (default None) + :param verbose: bool, whether to display information during the process (default False) + """ + + MAX_FEATURES_FOR_EMPIRICAL_ESTIMATION = 25 + PROBABILISTIC_MODELS = ["naive", "full"] + + def __init__(self, + prob_model="full", + bootstrap_trials=300, + bagging_trials=300, + bagging_range=15, + confidence_level=0.95, + region='intervals', + random_state=None, + verbose=False): + assert prob_model in ReadMe.PROBABILISTIC_MODELS, \ + f'unknown {prob_model=}, valid ones are {ReadMe.PROBABILISTIC_MODELS=}' + self.prob_model = prob_model + self.bootstrap_trials = bootstrap_trials + self.bagging_trials = bagging_trials + self.bagging_range = bagging_range + self.confidence_level = confidence_level + self.region = region + self.random_state = random_state + self.verbose = verbose + + def fit(self, X, y): + self._check_matrix(X) + + self.rng = np.random.default_rng(self.random_state) + self.classes_ = np.unique(y) + + + Xsize = X.shape[0] + + # Bootstrap loop + self.Xboots, self.yboots = [], [] + for _ in range(self.bootstrap_trials): + idx = self.rng.choice(Xsize, size=Xsize, replace=True) + self.Xboots.append(X[idx]) + self.yboots.append(y[idx]) + + return self + + def predict_conf(self, X, confidence_level=0.95) -> (np.ndarray, ConfidenceRegionABC): + self._check_matrix(X) + + n_features = X.shape[1] + boots_prevalences = [] + for Xboots, yboots in tqdm( + zip(self.Xboots, self.yboots), + desc='bootstrap predictions', total=self.bootstrap_trials, disable=not self.verbose + ): + bagging_estimates = [] + for _ in range(self.bagging_trials): + feat_idx = self.rng.choice(n_features, size=self.bagging_range, replace=False) + Xboots_bagging = Xboots[:, feat_idx] + X_boots_bagging = X[:, feat_idx] + bagging_prev = self._quantify_iteration(Xboots_bagging, yboots, X_boots_bagging) + bagging_estimates.append(bagging_prev) + + boots_prevalences.append(np.mean(bagging_estimates, axis=0)) + + conf = WithConfidenceABC.construct_region(boots_prevalences, confidence_level, method=self.region) + prev_estim = conf.point_estimate() + + return prev_estim, conf + + def predict(self, X): + prev_estim, _ = self.predict_conf(X) + return prev_estim + + def _quantify_iteration(self, Xtr, ytr, Xte): + """Single ReadMe estimate.""" + PX_given_Y = np.asarray([self._compute_P(Xtr[ytr == c]) for i,c in enumerate(self.classes_)]) + PX = self._compute_P(Xte) + + res = lsq_linear(A=PX_given_Y.T, b=PX, bounds=(0, 1)) + pY = np.maximum(res.x, 0) + return pY / pY.sum() + + def _check_matrix(self, X): + """the "full" model requires estimating empirical distributions; due to the high computational cost, + this function is only made available for binary matrices""" + if self.prob_model == 'full' and not self._is_binary_matrix(X): + raise ValueError('the empirical distribution can only be computed efficiently on binary matrices') + + def _is_binary_matrix(self, X): + data = X.data if sparse.issparse(X) else X + return np.all((data == 0) | (data == 1)) + + def 
+    def _compute_P(self, X):
+        if self.prob_model == 'naive':
+            return self._multinomial_distribution(X)
+        elif self.prob_model == 'full':
+            return self._empirical_distribution(X)
+        else:
+            raise ValueError(f'unknown {self.prob_model=}; valid ones are {ReadMe.PROBABILISTIC_MODELS=}')
+
+    def _empirical_distribution(self, X):
+        if X.shape[1] > self.MAX_FEATURES_FOR_EMPIRICAL_ESTIMATION:
+            raise ValueError(f'the empirical distribution can only be computed efficiently for dimensions '
+                             f'smaller than or equal to {self.MAX_FEATURES_FOR_EMPIRICAL_ESTIMATION}')
+
+        # we convert every binary row (e.g., 0 0 1 0 1) into the equivalent number (e.g., 5)
+        K = X.shape[1]
+        binary_powers = 1 << np.arange(K-1, -1, -1)  # (2^(K-1), ..., 8, 4, 2, 1)
+        X_as_binary_numbers = X @ binary_powers
+
+        # count occurrences and compute probs
+        counts = np.bincount(X_as_binary_numbers, minlength=2 ** K).astype(float)
+        probs = counts / counts.sum()
+        return probs
+
+    def _multinomial_distribution(self, X):
+        PX = np.asarray(X.sum(axis=0))
+        PX = normalize(PX, norm='l1', axis=1)
+        return PX.ravel()
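+
+
+if __name__ == '__main__':
+    # Illustrative sketch on synthetic data (a hypothetical demo, not part of the library API):
+    # ReadMe with prob_model='full' expects a binary data matrix, so we simulate class-conditional
+    # Bernoulli features whose distribution reflects the class prevalence.
+    rng = np.random.default_rng(0)
+    n_docs, n_feats = 500, 20
+    y_demo = rng.integers(0, 2, size=n_docs)
+    X_demo = (rng.random((n_docs, n_feats)) < np.where(y_demo[:, None] == 1, 0.6, 0.3)).astype(int)
+
+    readme = ReadMe(prob_model='full', bagging_range=5, bootstrap_trials=20, bagging_trials=20, random_state=0)
+    readme.fit(X_demo, y_demo)
+    point_estimate, conf_region = readme.predict_conf(X_demo, confidence_level=0.95)
+    print(f'estimated prevalence: {point_estimate}')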
[f"{b:.2f}" for b in bin_centers], rotation=45) + + # Mostrar el gráfico + plt.tight_layout() + plt.show() + +if __name__ == '__main__': + import quapy as qp + from sklearn.linear_model import LogisticRegression + data = qp.datasets.fetch_UCIBinaryDataset(qp.datasets.UCI_BINARY_DATASETS[6]) + train, test = data.train_test + classifier = LogisticRegression() + classifier.fit(*train.Xy) + calibration_plot(classifier, *test.Xy) diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py index cc09f16..de5f61a 100644 --- a/quapy/tests/test_datasets.py +++ b/quapy/tests/test_datasets.py @@ -15,8 +15,11 @@ class TestDatasets(unittest.TestCase): return PCC(LogisticRegression(C=0.001, max_iter=100)) def _check_dataset(self, dataset): + train, test = dataset.reduce().train_test q = self.new_quantifier() print(f'testing method {q} in {dataset.name}...', end='') + if len(train)>500: + train = train.sampling(500) q.fit(*dataset.training.Xy) estim_prevalences = q.predict(dataset.test.instances) self.assertTrue(F.check_prevalence_vector(estim_prevalences)) @@ -42,7 +45,9 @@ class TestDatasets(unittest.TestCase): self._check_dataset(dataset) def test_twitter(self): - for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST: + # all the datasets are contained in the same resource; if the first one + # works, there is no need to test for the rest + for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST[:1]: print(f'loading dataset {dataset_name}...', end='') dataset = fetch_twitter(dataset_name, min_df=10) dataset.stats() @@ -129,7 +134,7 @@ class TestDatasets(unittest.TestCase): n_classes = train.n_classes train = train.sampling(100, *F.uniform_prevalence(n_classes)) q = self.new_quantifier() - q.fit(train) + q.fit(*train.Xy) self._check_samples(gen, q, max_samples_test=5) diff --git a/quapy/tests/test_hierarchy.py b/quapy/tests/test_hierarchy.py index 0cf9b9b..7a2d07e 100644 --- a/quapy/tests/test_hierarchy.py +++ b/quapy/tests/test_hierarchy.py @@ -9,9 +9,8 @@ import inspect class HierarchyTestCase(unittest.TestCase): def test_aggregative(self): - lr = LogisticRegression() for m in AGGREGATIVE_METHODS: - self.assertEqual(isinstance(m(lr), AggregativeQuantifier), True) + self.assertEqual(isinstance(m(), AggregativeQuantifier), True) def test_inspect_aggregative(self): @@ -22,6 +21,7 @@ class HierarchyTestCase(unittest.TestCase): quantifiers = [cls for cls in classes if issubclass(cls, BaseQuantifier)] quantifiers = [cls for cls in quantifiers if issubclass(cls, AggregativeQuantifier)] quantifiers = [cls for cls in quantifiers if not inspect.isabstract(cls) ] + quantifiers = [cls for cls in quantifiers if cls is not OneVsAllAggregative] for cls in quantifiers: self.assertIn(cls, AGGREGATIVE_METHODS) diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py index 533bf1a..3e4149e 100644 --- a/quapy/tests/test_methods.py +++ b/quapy/tests/test_methods.py @@ -16,21 +16,21 @@ from quapy.method.composable import ( ComposableQuantifier, LeastSquaresLoss, HellingerSurrogateLoss, - ClassTransformer, - HistogramTransformer, + ClassRepresentation, + HistogramRepresentation, CVClassifier ) COMPOSABLE_METHODS = [ ComposableQuantifier( # ACC LeastSquaresLoss(), - ClassTransformer(CVClassifier(LogisticRegression())) + ClassRepresentation(CVClassifier(LogisticRegression())) ), ComposableQuantifier( # HDy HellingerSurrogateLoss(), - HistogramTransformer( + HistogramRepresentation( 3, # 3 bins per class - preprocessor = ClassTransformer(CVClassifier(LogisticRegression())) + preprocessor = 
diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py
index cc09f16..de5f61a 100644
--- a/quapy/tests/test_datasets.py
+++ b/quapy/tests/test_datasets.py
@@ -15,8 +15,11 @@ class TestDatasets(unittest.TestCase):
         return PCC(LogisticRegression(C=0.001, max_iter=100))
 
     def _check_dataset(self, dataset):
+        train, test = dataset.reduce().train_test
         q = self.new_quantifier()
         print(f'testing method {q} in {dataset.name}...', end='')
+        if len(train)>500:
+            train = train.sampling(500)
-        q.fit(*dataset.training.Xy)
+        q.fit(*train.Xy)
-        estim_prevalences = q.predict(dataset.test.instances)
+        estim_prevalences = q.predict(test.instances)
         self.assertTrue(F.check_prevalence_vector(estim_prevalences))
 
@@ -42,7 +45,9 @@ class TestDatasets(unittest.TestCase):
             self._check_dataset(dataset)
 
     def test_twitter(self):
-        for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST:
+        # all the datasets are contained in the same resource; if the first one
+        # works, there is no need to test the rest
+        for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST[:1]:
             print(f'loading dataset {dataset_name}...', end='')
             dataset = fetch_twitter(dataset_name, min_df=10)
             dataset.stats()
@@ -129,7 +134,7 @@ class TestDatasets(unittest.TestCase):
         n_classes = train.n_classes
         train = train.sampling(100, *F.uniform_prevalence(n_classes))
         q = self.new_quantifier()
-        q.fit(train)
+        q.fit(*train.Xy)
         self._check_samples(gen, q, max_samples_test=5)
 
diff --git a/quapy/tests/test_hierarchy.py b/quapy/tests/test_hierarchy.py
index 0cf9b9b..7a2d07e 100644
--- a/quapy/tests/test_hierarchy.py
+++ b/quapy/tests/test_hierarchy.py
@@ -9,9 +9,8 @@ import inspect
 class HierarchyTestCase(unittest.TestCase):
 
     def test_aggregative(self):
-        lr = LogisticRegression()
         for m in AGGREGATIVE_METHODS:
-            self.assertEqual(isinstance(m(lr), AggregativeQuantifier), True)
+            self.assertEqual(isinstance(m(), AggregativeQuantifier), True)
 
     def test_inspect_aggregative(self):
 
@@ -22,6 +21,7 @@ class HierarchyTestCase(unittest.TestCase):
         quantifiers = [cls for cls in classes if issubclass(cls, BaseQuantifier)]
         quantifiers = [cls for cls in quantifiers if issubclass(cls, AggregativeQuantifier)]
         quantifiers = [cls for cls in quantifiers if not inspect.isabstract(cls) ]
+        quantifiers = [cls for cls in quantifiers if cls is not OneVsAllAggregative]
 
         for cls in quantifiers:
             self.assertIn(cls, AGGREGATIVE_METHODS)
diff --git a/quapy/tests/test_methods.py b/quapy/tests/test_methods.py
index 533bf1a..3e4149e 100644
--- a/quapy/tests/test_methods.py
+++ b/quapy/tests/test_methods.py
@@ -16,21 +16,21 @@ from quapy.method.composable import (
     ComposableQuantifier,
     LeastSquaresLoss,
     HellingerSurrogateLoss,
-    ClassTransformer,
-    HistogramTransformer,
+    ClassRepresentation,
+    HistogramRepresentation,
     CVClassifier
 )
 
 COMPOSABLE_METHODS = [
     ComposableQuantifier( # ACC
         LeastSquaresLoss(),
-        ClassTransformer(CVClassifier(LogisticRegression()))
+        ClassRepresentation(CVClassifier(LogisticRegression()))
     ),
     ComposableQuantifier( # HDy
         HellingerSurrogateLoss(),
-        HistogramTransformer(
+        HistogramRepresentation(
             3, # 3 bins per class
-            preprocessor = ClassTransformer(CVClassifier(LogisticRegression()))
+            preprocessor = ClassRepresentation(CVClassifier(LogisticRegression()))
         )
     ),
 ]
@@ -113,7 +113,6 @@ class TestMethods(unittest.TestCase):
         self.assertTrue(check_prevalence_vector(estim_prevalences))
 
     def test_composable(self):
-        from packaging.version import Version
         if check_compatible_qunfold_version():
             for dataset in TestMethods.datasets:
                 for q in COMPOSABLE_METHODS:
diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py
index c13b665..6423b4e 100644
--- a/quapy/tests/test_modsel.py
+++ b/quapy/tests/test_modsel.py
@@ -39,31 +39,30 @@ class ModselTestCase(unittest.TestCase):
         obtains the same optimal parameters
         """
 
-        q = PACC(LogisticRegression(random_state=1, max_iter=500))
+        q = PACC(LogisticRegression(random_state=1, max_iter=3000))
 
-        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=50).reduce(n_train=500, random_state=1)
+        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=50)
         training, validation = data.training.split_stratified(0.7, random_state=1)
 
-        param_grid = {'classifier__C': np.logspace(-3,3,7)}
+        param_grid = {'classifier__C': np.logspace(-3,3,7), 'classifier__class_weight': ['balanced', None]}
         app = APP(validation, sample_size=100, random_state=1)
 
-        print('starting model selection in sequential exploration')
-        tinit = time.time()
-        modsel = GridSearchQ(
-            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True
-        ).fit(*training.Xy)
-        tend_seq = time.time()-tinit
-        best_c_seq = modsel.best_params_['classifier__C']
-        print(f'[done] took {tend_seq:.2f}s best C = {best_c_seq}')
+        def do_gridsearch(n_jobs):
+            print(f'starting model selection with {n_jobs=}')
+            t_init = time.time()
+            modsel = GridSearchQ(
+                q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=n_jobs, verbose=True
+            ).fit(*training.Xy)
+            t_end = time.time()-t_init
+            best_c = modsel.best_params_['classifier__C']
+            print(f'[done] took {t_end:.2f}s best C = {best_c}')
+            return t_end, best_c
 
-        print('starting model selection in parallel exploration')
-        tinit = time.time()
-        modsel = GridSearchQ(
-            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True
-        ).fit(*training.Xy)
-        tend_par = time.time() - tinit
-        best_c_par = modsel.best_params_['classifier__C']
-        print(f'[done] took {tend_par:.2f}s best C = {best_c_par}')
+        tend_seq, best_c_seq = do_gridsearch(n_jobs=1)
+        tend_par, best_c_par = do_gridsearch(n_jobs=-1)
+
+        print(f'sequential: took {tend_seq:.2f}s, best C = {best_c_seq}')
+        print(f'parallel: took {tend_par:.2f}s, best C = {best_c_par}')
 
         self.assertEqual(best_c_seq, best_c_par)
         self.assertLess(tend_par, tend_seq)
diff --git a/setup.py b/setup.py
index bb8ad80..ba5f205 100644
--- a/setup.py
+++ b/setup.py
@@ -160,7 +160,7 @@ setup(
         'Contributors': 'https://github.com/HLT-ISTI/QuaPy/graphs/contributors',
         'Bug Reports': 'https://github.com/HLT-ISTI/QuaPy/issues',
         'Wiki': 'https://github.com/HLT-ISTI/QuaPy/wiki',
-        'Documentation': 'https://hlt-isti.github.io/QuaPy/build/html/index.html',
+        'Documentation': 'https://hlt-isti.github.io/QuaPy/',
         'Source': 'https://github.com/HLT-ISTI/QuaPy/',
     },
 )
diff --git a/testing_refactor.py b/testing_refactor.py
deleted file mode 100644
index c73d31c..0000000
--- a/testing_refactor.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from sklearn.linear_model import LogisticRegression
-import quapy as qp
-from method.aggregative import *
-
-datasets = qp.datasets.UCI_MULTICLASS_DATASETS[1]
-data = qp.datasets.fetch_UCIMulticlassDataset(datasets)
-train, test = data.train_test
-
-Xtr, ytr = train.Xy
-Xte = test.X
-
-quant = EMQ(LogisticRegression(), calib='bcts')
-quant.fit(Xtr, ytr)
-prev = quant.predict(Xte)
-
-print(prev)
-post = quant.predict_proba(Xte)
-print(post)
-post = quant.classify(Xte)
-print(post)
-
-# AggregativeMedianEstimator()
-
-
-# test CC, prevent from doing 5FCV for nothing
-# test PACC o PCC with LinearSVC; removing "adapt_if_necessary" form _check_classifier
\ No newline at end of file