
aggregation methods updated

Alejandro Moreo Fernandez 2020-12-09 12:46:50 +01:00
parent 9c8d29156c
commit 2361186a01
9 changed files with 71 additions and 53 deletions

View File

@ -1,4 +1,4 @@
from .dataset import *
from .data import *
from . import functional
from . import method
from . import error

View File

@ -1,9 +1,10 @@
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from dataset.base import Dataset
from data.base import Dataset
from scipy.sparse import spmatrix
from utils.util import parallelize
from .base import LabelledCollection
from tqdm import tqdm
def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
@ -78,8 +79,8 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
consisting of lists of integer values representing indices.
"""
__check_type(dataset.training.instances, list, str)
__check_type(dataset.test.instances, list, str)
__check_type(dataset.training.instances, np.ndarray, str)
__check_type(dataset.test.instances, np.ndarray, str)
indexer = IndexTransformer(min_df=min_df, **kwargs)
training_index = indexer.fit_transform(dataset.training.instances)
@ -105,7 +106,6 @@ def __check_type(container, container_type=None, element_type=None):
f'unexpected type of element (expected {container_type}, found {type(container)})'
class IndexTransformer:
def __init__(self, **kwargs):
@ -140,7 +140,7 @@ class IndexTransformer:
return self.fit(X).transform(X, n_jobs=n_jobs)
def vocabulary_size(self):
return len(self.vocabulary_) + 1 # the reserved unk token
return len(self.vocabulary_)
def add_word(self, word):
if word in self.vocabulary_:
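
A minimal usage sketch of the indexing step documented above. It is hedged: it assumes `index` is exposed under the same `qp.preprocessing` namespace that test.py (at the end of this diff) uses for `text2tfidf`, and it reuses the kindle review paths from that script as placeholders.

import quapy as qp

# placeholder paths, taken from test.py below
train_path = './datasets/reviews/kindle/train.txt'
test_path = './datasets/reviews/kindle/test.txt'

# after this commit, raw textual instances are expected to be an np.ndarray of str (no longer a list of str)
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)

# returns a new Dataset (inplace=False) whose instances are lists of integer indices
indexed = qp.preprocessing.index(dataset, min_df=5, inplace=False)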

View File

@ -1,5 +1,6 @@
from . import base
from . import aggregative as agg
from . import non_aggregative as nagg
from . import non_aggregative
AGGREGATIVE_METHODS = {
@ -13,13 +14,10 @@ AGGREGATIVE_METHODS = {
}
NON_AGGREGATIVE_METHODS = {
nagg.MaximumLikelihoodPrevalenceEstimation
non_aggregative.MaximumLikelihoodPrevalenceEstimation
}
QUANTIFICATION_METHODS = AGGREGATIVE_METHODS | NON_AGGREGATIVE_METHODS
# common aliases
MLPE = nagg.MaximumLikelihoodPrevalenceEstimation

View File

@ -1,14 +1,14 @@
import numpy as np
from .base import *
from ..error import mae
from copy import deepcopy
import functional as F
from ..classification.svmperf import SVMperf
from ..dataset import LabelledCollection
import error
from method.base import BaseQuantifier
from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection
from sklearn.metrics import confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
from joblib import Parallel, delayed
from abc import abstractmethod
# Abstract classes
@ -23,6 +23,14 @@ class AggregativeQuantifier(BaseQuantifier):
@abstractmethod
def fit(self, data: LabelledCollection, fit_learner=True, *args): ...
@property
def learner(self):
return self.learner_
@learner.setter
def learner(self, value):
self.learner_ = value
def classify(self, instances):
return self.learner.predict(instances)
@ -69,12 +77,12 @@ def training_helper(learner,
Training procedure common to all Aggregative Quantifiers.
:param learner: the learner to be fit
:param data: the data on which to fit the learner. If requested, the data will be split before fitting the learner.
:param fit_learner: whether or not to fit the learner
:param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
:param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
learner is not probabilistic, then a CalibratedCV instance of it is trained)
:param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
:return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
or None otherwise)
or None otherwise) to be used as a validation set for any subsequent parameter fitting
"""
if fit_learner:
if ensure_probabilistic:
@ -239,7 +247,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
# M-step: qs_pos is Ps+1(y=+1)
qs = ps.mean(axis=0)
if qs_prev_ is not None and mae(qs, qs_prev_) < epsilon and s>10:
if qs_prev_ is not None and error.mae(qs, qs_prev_) < epsilon and s>10:
converged = True
qs_prev_ = qs
@ -265,7 +273,8 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification'
assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
f'Use the class OneVsAll to enable {self.__class__.__name__} to work on single-label data.'
self.learner, validation = training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
Px = self.soft_classify(validation.instances)
@ -304,15 +313,19 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
class OneVsAll(AggregativeQuantifier):
"""
Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
"""
def __init__(self, binary_method, n_jobs=-1, **kwargs):
def __init__(self, binary_method, n_jobs=-1):
self.binary_method = binary_method
self.n_jobs = n_jobs
self.kwargs = kwargs
def fit(self, data: LabelledCollection, **kwargs):
assert not data.binary, f'{self.__class__.__name__} expects non-binary data'
self.class_method = {c: self.binary_method(**self.kwargs) for c in data.classes_}
assert isinstance(self.binary_method, BaseQuantifier), f'{self.binary_method} does not seem to be a Quantifier'
self.class_method = {c: deepcopy(self.binary_method) for c in data.classes_}
Parallel(n_jobs=self.n_jobs, backend='threading')(
delayed(self._delayed_binary_fit)(c, self.class_method, data, **kwargs) for c in data.classes_
)
@ -332,10 +345,10 @@ class OneVsAll(AggregativeQuantifier):
return sorted(self.class_method.keys())
def set_params(self, **parameters):
self.kwargs=parameters
self.binary_method.set_params(**parameters)
def get_params(self, deep=True):
return self.kwargs
return self.binary_method.get_params()
def _delayed_binary_predict(self, c, learners, X):
return learners[c].classify(X).mean() # the mean is the estimation for the positive class prevalence
@ -346,6 +359,12 @@ class OneVsAll(AggregativeQuantifier):
class ExplicitLossMinimisation(AggregativeQuantifier):
"""
A variant of Explicit Loss Minimisation based on SVMperf that works also on single-label data. It uses one binary
quantifier for each class and then l1-normalizes the class predictions so that they sum up to one.
This variant was used in Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining 6(19), 1-22 (2016)
"""
def __init__(self, svmperf_base, loss, **kwargs):
self.svmperf_base = svmperf_base
@ -354,16 +373,9 @@ class ExplicitLossMinimisation(AggregativeQuantifier):
def fit(self, data: LabelledCollection, fit_learner=True, *args):
assert fit_learner, 'the method requires that fit_learner=True'
if data.binary:
self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
else:
self.learner = OneVsAll(
binary_method=ExplicitLossMinimisationBinary,
n_jobs=-1,
svmperf_base=self.svmperf_base,
loss=self.loss,
**self.kwargs
)
self.learner = ExplicitLossMinimisationBinary(self.svmperf_base, self.loss, **self.kwargs)
if not data.binary:
self.learner = OneVsAll(self.learner, n_jobs=-1)
return self.learner.fit(data, *args)
def quantify(self, instances, *args):
@ -393,6 +405,7 @@ class ExplicitLossMinimisationBinary(AggregativeQuantifier):
return self.learner.predict(X)
class SVMQ(ExplicitLossMinimisation):
def __init__(self, svmperf_base, **kwargs):
super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
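
The refactor above changes how OneVsAll is constructed: it now receives an already-instantiated binary quantifier (rather than a class plus **kwargs), deep-copies that instance once per class at fit time, and delegates get_params/set_params to the wrapped quantifier. A minimal sketch of the new contract, mirroring the usage in test.py at the end of this diff (the dataset paths are the same placeholders used there):

from sklearn.linear_model import LogisticRegression
import quapy as qp
from method.aggregative import OneVsAll

# a ternary (non-binary) dataset, as in test.py below
train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
test_path = './datasets/twitter/test/sst.test.feature.txt'
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)

binary_quantifier = qp.method.aggregative.HDy(LogisticRegression())  # any binary quantifier works
model = OneVsAll(binary_quantifier, n_jobs=-1)  # one deepcopy of binary_quantifier per class
print(model.get_params())                       # delegated to the wrapped binary quantifier
model.fit(dataset.training)                     # asserts that the data is not binary
estim_prevalences = model.quantify(dataset.test.instances)  # l1-normalized class prevalences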

View File

@ -1,5 +1,4 @@
from abc import ABCMeta, abstractmethod
import quapy as qp
# Base Quantifier abstract class
@ -7,7 +6,7 @@ import quapy as qp
class BaseQuantifier(metaclass=ABCMeta):
@abstractmethod
def fit(self, data: qp.LabelledCollection, *args): ...
def fit(self, data, *args): ...
@abstractmethod
def quantify(self, instances, *args): ...
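
The base interface above now leaves the type of `data` unannotated in `fit` (dropping the direct dependency on `qp.LabelledCollection`). A hedged sketch of a custom quantifier written against the two abstract methods visible in this hunk; if BaseQuantifier declares further abstract members not shown here (e.g. get_params/set_params), those would need to be implemented as well:

import numpy as np
from method.base import BaseQuantifier

class UniformPrevalenceQuantifier(BaseQuantifier):
    # hypothetical example: ignores the training data and always predicts the uniform prevalence

    def fit(self, data, *args):
        self.n_classes_ = len(data.classes_)
        return self

    def quantify(self, instances, *args):
        return np.full(self.n_classes_, 1. / self.n_classes_)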

test.py
View File

@ -2,23 +2,25 @@ from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from method.aggregative import OneVsAll
# load a textual binary dataset and create a tfidf bag of words
#from method.aggregative import OneVsAll, BaseQuantifier
train_path = './datasets/reviews/kindle/train.txt'
test_path = './datasets/reviews/kindle/test.txt'
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
dataset.test = dataset.test.sampling(500, 0.6, 0.4)
qp.preprocessing.text2tfidf(dataset, inplace=True)
qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
#dataset.training = dataset.training.sampling(1000, 0.4, 0.6)
#dataset.test = dataset.test.sampling(500, 0.6, 0.4)
#qp.preprocessing.text2tfidf(dataset, inplace=True)
#qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
# load a sparse matrix ternary dataset
#train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
#test_path = './datasets/twitter/test/sst.test.feature.txt'
#dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
#dataset.training = dataset.training.sampling(500, 0.3, 0.2, 0.5)
#dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)
train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
test_path = './datasets/twitter/test/sst.test.feature.txt'
dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
dataset.training = dataset.training.sampling(500, 0.3, 0.4, 0.3)
dataset.test = dataset.test.sampling(500, 0.2, 0.5, 0.3)
# training a quantifier
learner = LogisticRegression()
@ -30,17 +32,23 @@ learner = LogisticRegression()
# q = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
# q = qp.method.aggregative.ExplicitLossMinimisation(svmperf_base='./svm_perf_quantification', loss='q', verbose=0, C=1000)
# q = qp.method.aggregative.SVMQ(svmperf_base='./svm_perf_quantification', verbose=0, C=1000)
q = qp.method.aggregative.HDy(learner)
q.fit(dataset.training)
#model = qp.method.aggregative.HDy(learner)
#
model = qp.method.aggregative.HDy(learner)
model = OneVsAll(model)
print(model.get_params())
model.fit(dataset.training)
# estimating class prevalences
prevalences_estim = q.quantify(dataset.test.instances)
prevalences_estim = model.quantify(dataset.test.instances)
prevalences_true = dataset.test.prevalence()
# evaluation (one single prediction)
error = qp.error.mae(prevalences_true, prevalences_estim)
print(f'method {q.__class__.__name__}')
print(f'method {model.__class__.__name__}')
print(f'true prevalence {F.strprev(prevalences_true)}')
print(f'estim prevalence {F.strprev(prevalences_estim)}')
print(f'MAE={error:.3f}')