forked from moreo/QuaPy

aggregation methods updated

commit 2361186a01 (parent 9c8d29156c)

quapy/__init__.py
@@ -1,4 +1,4 @@
-from .dataset import *
+from .data import *
 from . import functional
 from . import method
 from . import error

quapy/data/preprocessing.py
@@ -1,9 +1,10 @@
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from dataset.base import Dataset
+from data.base import Dataset
 from scipy.sparse import spmatrix
 from utils.util import parallelize
+from .base import LabelledCollection
 from tqdm import tqdm


 def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
@@ -78,8 +79,8 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
     :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
     consisting of lists of integer values representing indices.
     """
-    __check_type(dataset.training.instances, list, str)
-    __check_type(dataset.test.instances, list, str)
+    __check_type(dataset.training.instances, np.ndarray, str)
+    __check_type(dataset.test.instances, np.ndarray, str)

     indexer = IndexTransformer(min_df=min_df, **kwargs)
     training_index = indexer.fit_transform(dataset.training.instances)
@@ -105,7 +106,6 @@ def __check_type(container, container_type=None, element_type=None):
         f'unexpected type of element (expected {container_type}, found {type(container)})'

-

 class IndexTransformer:

     def __init__(self, **kwargs):
@@ -140,7 +140,7 @@ class IndexTransformer:
         return self.fit(X).transform(X, n_jobs=n_jobs)

     def vocabulary_size(self):
-        return len(self.vocabulary_) + 1  # the reserved unk token
+        return len(self.vocabulary_)

     def add_word(self, word):
         if word in self.vocabulary_:
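
Note on the vocabulary_size change above: the old version counted a reserved slot for the unk token, so callers that size data structures from it must now add any reserved indices themselves. A hedged sketch of the caller-side effect (the embedding table is illustrative, not code from this commit):

    indexer = IndexTransformer(min_df=5)
    indexer.fit(dataset.training.instances)
    n_rows = indexer.vocabulary_size() + 1     # the caller now reserves the unk slot explicitly
    embedding = np.random.randn(n_rows, 100)   # hypothetical embedding table sized from it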

quapy/method/__init__.py
@@ -1,5 +1,6 @@
+from . import base
 from . import aggregative as agg
-from . import non_aggregative as nagg
+from . import non_aggregative


 AGGREGATIVE_METHODS = {
@@ -13,13 +14,10 @@ AGGREGATIVE_METHODS = {
 }

 NON_AGGREGATIVE_METHODS = {
-    nagg.MaximumLikelihoodPrevalenceEstimation
+    non_aggregative.MaximumLikelihoodPrevalenceEstimation
 }

 QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS


-# common alisases
-MLPE = nagg.MaximumLikelihoodPrevalenceEstimation
-
-
+
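
With the nagg alias dropped, QUANTIFICATION_METHODS remains the plain set union of both registries and can be scanned directly; a minimal sketch (the loop is illustrative, not part of the commit):

    import quapy as qp

    for method in qp.method.QUANTIFICATION_METHODS:
        print(method.__name__)   # every registered aggregative and non-aggregative class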

quapy/method/aggregative.py
@@ -1,14 +1,14 @@
 import numpy as np
-from .base import *
-from ..error import mae
 from copy import deepcopy
 import functional as F
-from ..classification.svmperf import SVMperf
-from ..dataset import LabelledCollection
+import error
+from method.base import BaseQuantifier
+from quapy.classification.svmperf import SVMperf
+from quapy.data import LabelledCollection
 from sklearn.metrics import confusion_matrix
 from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
-
+from abc import abstractmethod


 # Abstract classes
@@ -23,6 +23,14 @@ class AggregativeQuantifier(BaseQuantifier):
     @abstractmethod
     def fit(self, data: LabelledCollection, fit_learner=True, *args): ...

+    @property
+    def learner(self):
+        return self.learner_
+
+    @learner.setter
+    def learner(self, value):
+        self.learner_ = value
+
     def classify(self, instances):
         return self.learner.predict(instances)

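
The new learner property/setter pair lets callers swap the wrapped classifier after construction; a hedged usage sketch (the method and learner choices are illustrative):

    import quapy as qp
    from sklearn.linear_model import LogisticRegression

    model = qp.method.aggregative.HDy(LogisticRegression())
    model.learner = LogisticRegression(C=10)   # rebinds self.learner_ through the setter
    # model.classify(X) now delegates to the swapped learner's predict(X)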
@@ -69,12 +77,12 @@ def training_helper(learner,
     Training procedure common to all Aggregative Quantifiers.
     :param learner: the learner to be fit
     :param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
-    :param fit_learner: whether or not to fit the learner
+    :param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
     :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
     learner is not probabilistic, then a CalibratedCV instance of it is trained)
     :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
     :return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
-    or None otherwise)
+    or None otherwise) to be used as a validation set for any subsequent parameter fitting
     """
     if fit_learner:
         if ensure_probabilistic:
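
A sketch of the call pattern the revised docstring describes, assuming a held-out split is requested (argument values illustrative):

    learner, val = training_helper(LogisticRegression(), data, fit_learner=True,
                                   ensure_probabilistic=True, train_val_split=0.6)
    # learner is fit on 60% of data; val is the remaining LabelledCollection
    # (or None when no split was requested), usable for subsequent parameter fitting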
@@ -239,7 +247,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
         # M-step: qs_pos is Ps+1(y=+1)
         qs = ps.mean(axis=0)

-        if qs_prev_ is not None and mae(qs, qs_prev_) < epsilon and s>10:
+        if qs_prev_ is not None and error.mae(qs, qs_prev_) < epsilon and s>10:
             converged = True

         qs_prev_ = qs
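
For reference, the convergence test now goes through error.mae, the mean absolute error between two prevalence vectors; a minimal sketch of the quantity (not necessarily the module's exact implementation):

    import numpy as np

    def mae_sketch(prevs, prevs_hat):
        return np.abs(prevs - prevs_hat).mean()

    mae_sketch(np.array([0.3, 0.7]), np.array([0.25, 0.75]))   # -> 0.05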
@@ -265,7 +273,8 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
         self.learner = learner

     def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
-        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
+        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
+                            f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
         self.learner, validation = training_helper(
             self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
         Px = self.soft_classify(validation.instances)
@@ -304,15 +313,19 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):


 class OneVsAll(AggregativeQuantifier):
+    """
+    Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
+    quantifier for each class, and then l1-normalizes the outputs so that the class prevelences sum up to 1.
+    """

-    def __init__(self, binary_method, n_jobs=-1, **kwargs):
+    def __init__(self, binary_method, n_jobs=-1):
         self.binary_method = binary_method
         self.n_jobs = n_jobs
-        self.kwargs = kwargs

     def fit(self, data: LabelledCollection, **kwargs):
         assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
-        self.class_method = {c: self.binary_method(**self.kwargs) for c in data.classes_}
+        assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
+        self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
         Parallel(n_jobs=self.n_jobs, backend='threading')(
             delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
         )
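
OneVsAll now takes a configured quantifier instance and deepcopies it once per class, rather than instantiating a class from stored kwargs; a hedged sketch of the resulting usage (mirroring test.py further below):

    binary_q = qp.method.aggregative.HDy(LogisticRegression())   # any configured binary quantifier
    ova = OneVsAll(binary_q, n_jobs=-1)   # keeps one deepcopy of binary_q per class
    ova.fit(dataset.training)             # requires non-binary data; fits the copies in parallel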
@@ -332,10 +345,10 @@ class OneVsAll(AggregativeQuantifier):
         return sorted(self.class_method.keys())

     def set_params(self, **parameters):
-        self.kwargs=parameters
+        self.binary_method.set_params(**parameters)

     def get_params(self, deep=True):
-        return self.kwargs
+        return self.binary_method.get_params()

     def _delayed_binary_predict(self, c, learners, X):
         return learners[c].classify(X).mean()  # the mean is the estimation for the positive class prevalence
@@ -346,6 +359,12 @@ class OneVsAll(AggregativeQuantifier):


 class ExplicitLossMinimisation(AggregativeQuantifier):
+    """
+    A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
+    quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
+    This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
+    Social Network Analysis and Mining 6(19), 1–22 (2016)
+    """

     def __init__(self, svmperf_base, loss, **kwargs):
         self.svmperf_base = svmperf_base
@@ -354,16 +373,9 @@ class ExplicitLossMinimisation(AggregativeQuantifier):

     def fit(self, data: LabelledCollection, fit_learner=True, *args):
         assert fit_learner, 'the method requires that fit_learner=True'
-        if data.binary:
-            self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
-        else:
-            self.learner = OneVsAll(
-                binary_method=ExplicitLossMinimisationBinary,
-                n_jobs=-1,
-                svmperf_base=self.svmperf_base,
-                loss=self.loss,
-                **self.kwargs
-            )
+        self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
+        if not data.binary:
+            self.learner = OneVsAll(self.learner, n_jobs=-1)
         return self.learner.fit(data, *args)

     def quantify(self, instances, *args):
@@ -393,6 +405,7 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier):
         return self.learner.predict(X)


+
 class SVMQ(ExplicitLossMinimisation):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)

quapy/method/base.py
@@ -1,5 +1,4 @@
 from abc import ABCMeta, abstractmethod
-import quapy as qp


 # Base Quantifier abstract class
@@ -7,7 +6,7 @@ import quapy as qp
 class BaseQuantifier(metaclass=ABCMeta):

     @abstractmethod
-    def fit(self, data: qp.LabelledCollection, *args): ...
+    def fit(self, data, *args): ...

     @abstractmethod
     def quantify(self, instances, *args): ...
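
Dropping the qp.LabelledCollection annotation removes base.py's dependency on the quapy package (and the circular-import risk); fit now accepts any object exposing the expected attributes. A hedged sketch of a minimal subclass (hypothetical, not in the commit; any further abstract members the class declares would also need implementing):

    import numpy as np

    class UniformQuantifier(BaseQuantifier):
        def fit(self, data, *args):
            self.n_classes = len(data.classes_)   # duck-typed: anything with classes_ works
            return self

        def quantify(self, instances, *args):
            return np.full(self.n_classes, 1. / self.n_classes)   # uniform prevalence estimate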

test.py
@@ -2,23 +2,25 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC
 import quapy as qp
 import quapy.functional as F

+from method.aggregative import OneVsAll
+
 # load a textual binary dataset and create a tfidf bag of words
+#from method.aggregative import OneVsAll, BaseQuantifier
-train_path = './datasets/reviews/kindle/train.txt'
-test_path = './datasets/reviews/kindle/test.txt'
-dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
-dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
-dataset.test = dataset.test.sampling(500, 0.6, 0.4)
-qp.preprocessing.text2tfidf(dataset, inplace=True)
-qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
+#train_path = './datasets/reviews/kindle/train.txt'
+#test_path = './datasets/reviews/kindle/test.txt'
+#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
+#dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
+#dataset.test = dataset.test.sampling(500, 0.6, 0.4)
+#qp.preprocessing.text2tfidf(dataset, inplace=True)
+#qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)

 # load a sparse matrix ternary dataset
-#train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
-#test_path = './datasets/twitter/test/sst.test.feature.txt'
-#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
-#dataset.training = dataset.training.sampling(500, 0.3, 0.2, 0.5)
-#dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)
+train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
+test_path = './datasets/twitter/test/sst.test.feature.txt'
+dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
+dataset.training = dataset.training.sampling(500, 0.3, 0.4, 0.3)
+dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)

 # training a quantifier
 learner = LogisticRegression()

@@ -30,17 +32,23 @@ learner = LogisticRegression()
 # q = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
 # q = qp.method.aggregative.ExplicitLossMinimisation(svmperf_base='./svm_perf_quantification', loss='q', verbose=0, C=1000)
 # q = qp.method.aggregative.SVMQ(svmperf_base='./svm_perf_quantification', verbose=0, C=1000)
-q = qp.method.aggregative.HDy(learner)
-q.fit(dataset.training)
+#model = qp.method.aggregative.HDy(learner)
+#
+
+model = qp.method.aggregative.HDy(learner)
+model = OneVsAll(model)
+print(model.get_params())
+
+model.fit(dataset.training)

 # estimating class prevalences
-prevalences_estim = q.quantify(dataset.test.instances)
+prevalences_estim = model.quantify(dataset.test.instances)
 prevalences_true = dataset.test.prevalence()

 # evaluation (one single prediction)
 error = qp.error.mae(prevalences_true, prevalences_estim)

-print(f'method {q.__class__.__name__}')
+print(f'method {model.__class__.__name__}')
 print(f'true prevalence {F.strprev(prevalences_true)}')
 print(f'estim prevalence {F.strprev(prevalences_estim)}')
 print(f'MAE={error:.3f}')