From 91f8d8f3e1b3aa921608037a2a02723f85c9cb43 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Tue, 2 Feb 2021 12:10:57 +0100 Subject: [PATCH] readme updated --- README.md | 72 ++++++++++++++++++++++++++++++++++++++++++---- TODO.txt | 7 ----- quapy/data/base.py | 1 + 3 files changed, 68 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index c1b7b00..9efd4b0 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,13 @@ used for evaluating quantification methods. QuaPy also integrates commonly used datasets and offers visualization tools for facilitating the analysis and interpretation of results. +## A quick example: + +The following script fetches a Twitter dataset, trains and evaluates an +_Adjusted Classify & Count_ model in terms of the _Mean Absolute Error_ (MAE) +between the class prevalences estimated for the test set and the true prevalences +of the test set. + ```python import quapy as qp from sklearn.linear_model import LogisticRegression @@ -21,14 +28,69 @@ dataset = qp.datasets.fetch_twitter('semeval16') model = qp.method.aggregative.ACC(LogisticRegression()) model.fit(dataset.training) -prevalences_estim = model.quantify(dataset.test.instances) -prevalences_true = dataset.test.prevalence() +estim_prevalences = model.quantify(dataset.test.instances) +true_prevalences = dataset.test.prevalence() -error = qp.error.mae(prevalences_true, prevalences_estim) +error = qp.error.mae(true_prevalences, estim_prevalences) -print(f'MAE={error:.3f}') +print(f'Mean Absolute Error (MAE)={error:.3f}') ``` -binary, and single-label +Quantification is useful in scenarios of distribution shift. In other +words, we would not need to estimate the class prevalences of the test set if +we could rely on the IID assumption, as this prevalence would simply coincide with the +class prevalence of the training set. That is to say, a Quantification model +should be tested across samples characterized by different class prevalences. 
+QuaPy implements sampling procedures and evaluation protocols that automate this endeavour. +See the Wiki for detailed examples. + +## Features + +* Implementation of the most popular quantification methods (Classify-&-Count variants, Expectation-Maximization, +SVM-based variants for quantification, HDy, QuaNet, and Ensembles). +* Versatile functionality for performing evaluation based on artificial sampling protocols. +* Implementation of the most commonly used evaluation metrics (e.g., MAE, MRAE, MSE, NKLD). +* Popular datasets for Quantification (textual and numeric) available, including: + * 32 UCI Machine Learning datasets. + * 11 Twitter Sentiment datasets. + * 3 Reviews Sentiment datasets. +* Native support for binary and single-label scenarios of quantification. +* Model selection functionality targeting quantification-oriented losses. +* Plotting routines ("error-by-drift", "diagonal", and "bias" plots). + +## Requirements + +* scikit-learn, numpy, scipy +* pytorch (for QuaNet) +* svmperf patched for quantification (see below) +* joblib +* tqdm +* pandas, xlrd +* matplotlib + +## SVM-perf with quantification-oriented losses +In order to run experiments involving SVM(Q), SVM(KLD), SVM(NKLD), +SVM(AE), or SVM(RAE), you have to first download the +[svmperf](http://www.cs.cornell.edu/people/tj/svm_light/svm_perf.html) +package, apply the patch +[svm-perf-quantification-ext.patch](./svm-perf-quantification-ext.patch), and compile the sources. +The script [prepare_svmperf.sh](prepare_svmperf.sh) does the whole job. Simply run: + +``` +./prepare_svmperf.sh +``` + +The resulting directory [svm_perf_quantification](./svm_perf_quantification) contains the +patched version of _svmperf_ with quantification-oriented losses. + +The [svm-perf-quantification-ext.patch](./svm-perf-quantification-ext.patch) is an extension of the patch made available by +[Esuli et al. 
2015](https://dl.acm.org/doi/abs/10.1145/2700406?casa_token=8D2fHsGCVn0AAAAA:ZfThYOvrzWxMGfZYlQW_y8Cagg-o_l6X_PcF09mdETQ4Tu7jK98mxFbGSXp9ZSO14JkUIYuDGFG0) +that allows SVMperf to optimize for +the _Q_ measure as proposed by [Barranquero et al. 2015](https://www.sciencedirect.com/science/article/abs/pii/S003132031400291X) +and for the _KLD_ and _NKLD_ as proposed by [Esuli et al. 2015](https://dl.acm.org/doi/abs/10.1145/2700406?casa_token=8D2fHsGCVn0AAAAA:ZfThYOvrzWxMGfZYlQW_y8Cagg-o_l6X_PcF09mdETQ4Tu7jK98mxFbGSXp9ZSO14JkUIYuDGFG0) +for quantification. +This patch extends the former by also allowing SVMperf to optimize for +_AE_ and _RAE_. + diff --git a/TODO.txt b/TODO.txt index 8fd1ff5..5fcc07a 100644 --- a/TODO.txt +++ b/TODO.txt @@ -10,14 +10,8 @@ an instance of single-label with 2 labels. Check Add classnames to LabelledCollection ? Check the overhead in OneVsAll for SVMperf-based (?) Add HDy to QuaNet? if so, wrap HDy into OneVsAll in case the dataset is not binary. -Plots (one for binary -- the "diagonal", or for a specific class), another for the error as a funcition of drift. Add datasets for topic. -Add other methods Clarify whether QuaNet is an aggregative method or not. -Add datasets from Pérez-Gallego et al. 2017, 2019 -Add ensemble models from Pérez-Gallego et al. 2017, 2019 -Add plots models like those in Pérez-Gallego et al. 2017 (error boxes) -Add support for CV prediction in ACC and PACC for tpr, fpr Add medium swap method Explore the hyperparameter "number of bins" in HDy Implement HDy for single-label? @@ -25,4 +19,3 @@ Rename EMQ to SLD ? How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up to one always? 
Parallelize the kFCV in ACC and PACC -Requirements: xlrd for reading excel \ No newline at end of file diff --git a/quapy/data/base.py b/quapy/data/base.py index ffa1e33..6b2ddec 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -195,6 +195,7 @@ class Dataset: print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, ' f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, ' f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}') + return {'train': tr_stats ,'test':te_stats} @classmethod def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):