# Forked from moreo/QuaPy -- example: defining a custom quantification method.
import numpy as np
from sklearn.linear_model import LogisticRegression

import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import AggregativeProbabilisticQuantifier
from quapy.method.base import BinaryQuantifier
from quapy.model_selection import GridSearchQ
from quapy.protocol import APP

# Define a custom quantifier: for this example, we will consider a new quantification algorithm that uses a
# logistic regressor for generating posterior probabilities, and then applies a custom threshold value to the
# posteriors. Since the quantifier internally uses a classifier, it is an aggregative quantifier; and since it
# relies on posterior probabilities, it is a probabilistic-aggregative quantifier. Note also that it has an
# internal hyperparameter (let's say, alpha) which is the decision threshold. Let's also assume the quantifier
# is binary, for simplicity.

class MyQuantifier(AggregativeProbabilisticQuantifier, BinaryQuantifier):
    """
    Toy binary quantifier that thresholds the posterior probabilities produced by an
    internal probabilistic classifier: the estimated positive prevalence is the fraction
    of instances whose positive posterior exceeds the decision threshold `alpha`.

    :param classifier: a (scikit-learn style) probabilistic classifier
    :param alpha: decision threshold applied to the positive posteriors (default 0.5)
    """

    def __init__(self, classifier, alpha=0.5):
        # aggregative quantifiers are expected to expose the underlying model
        # through the self.classifier attribute
        self.classifier = classifier
        self.alpha = alpha

    def fit(self, data: LabelledCollection, fit_classifier=True):
        """Trains the internal classifier on the labelled collection and returns self."""
        assert fit_classifier, 'this quantifier needs to fit the classifier!'
        self.classifier.fit(*data.Xy)
        return self

    # in general, one would implement quantify(self, instances); however, for aggregative
    # methods it suffices to implement aggregate(), which receives the classifier outputs
    def aggregate(self, classif_predictions: np.ndarray):
        """
        Turns the posterior probabilities (already computed by quantify) into a
        prevalence vector [neg_prev, pos_prev] by thresholding at alpha.
        """
        posteriors_pos = classif_predictions[:, 1]
        prev_pos = (posteriors_pos > self.alpha).mean()
        return np.asarray([1 - prev_pos, prev_pos])

if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = 100

    # instantiate the custom quantifier defined above
    model = MyQuantifier(LogisticRegression(), alpha=0.5)

    # fetch the IMDb reviews dataset as tf-idf vectors
    train, test = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=5).train_test

    # model selection: jointly explore our hyperparameter alpha and one
    # hyperparameter of the underlying classifier, on a held-out validation split
    train, val = train.split_stratified(train_prop=0.75)
    param_grid = {
        'alpha': np.linspace(0, 1, 11),         # quantifier-dependent hyperparameter
        'classifier__C': np.logspace(-2, 2, 5)  # classifier-dependent hyperparameter
    }
    model = GridSearchQ(model, param_grid, protocol=APP(val), n_jobs=-1, verbose=True).fit(train)

    # evaluate the selected configuration via the artificial-prevalence protocol
    mae = qp.evaluation.evaluate(model, protocol=APP(test), error_metric='mae')

    print(f'MAE = {mae:.4f}')

    # final remarks: this method is only for demonstration purposes and makes little sense in
    # general. It relies on a hyperparameter alpha for binarizing the posterior probabilities;
    # a much better way of fulfilling this goal would be to calibrate the classifier
    # (LogisticRegression is already reasonably well calibrated) and then simply cut at 0.5.