added model selection for quantification

Alejandro Moreo Fernandez 2020-12-22 17:43:23 +01:00
parent 7d6f523e4b
commit 7bed93dcbf
6 changed files with 196 additions and 53 deletions

quapy/data/base.py

@@ -4,11 +4,19 @@ from sklearn.model_selection import train_test_split
 from quapy.functional import artificial_prevalence_sampling
 from scipy.sparse import vstack
+from util import temp_seed


 class LabelledCollection:

     def __init__(self, instances, labels, n_classes=None):
-        self.instances = instances if issparse(instances) else np.asarray(instances)
+        if issparse(instances):
+            self.instances = instances
+        elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):
+            # lists of strings take too much space as ndarrays (although python objects add a heavy overhead)
+            self.instances = np.asarray(instances, dtype=object)
+        else:
+            self.instances = np.asarray(instances)
         self.labels = np.asarray(labels, dtype=int)
         n_docs = len(self)
         if n_classes is None:
@@ -87,6 +95,7 @@ class LabelledCollection:
         return LabelledCollection(documents, labels, n_classes=self.n_classes)

     def split_stratified(self, train_prop=0.6):
+        # with temp_seed(42):
         tr_docs, te_docs, tr_labels, te_labels = \
             train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels)
         return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
@@ -110,14 +119,16 @@ class LabelledCollection:
             yield self.uniform_sampling_index(sample_size)

     def __add__(self, other):
-        if issparse(self.instances) and issparse(other.documents):
-            docs = vstack([self.instances, other.documents])
-        elif isinstance(self.instances, list) and isinstance(other.documents, list):
-            docs = self.instances + other.documents
+        if issparse(self.instances) and issparse(other.instances):
+            join_instances = vstack([self.instances, other.instances])
+        elif isinstance(self.instances, list) and isinstance(other.instances, list):
+            join_instances = self.instances + other.instances
+        elif isinstance(self.instances, np.ndarray) and isinstance(other.instances, np.ndarray):
+            join_instances = np.concatenate([self.instances, other.instances])
         else:
             raise NotImplementedError('unsupported operation for collection types')
         labels = np.concatenate([self.labels, other.labels])
-        return LabelledCollection(docs, labels)
+        return LabelledCollection(join_instances, labels)
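
Note: with the new ndarray branch, dense collections can now be concatenated with `+` as well. A minimal sketch of the intended behaviour (toy data; the import path is assumed from the package layout):

import numpy as np
from quapy.data import LabelledCollection

# two small dense collections over the same binary label space
a = LabelledCollection(np.random.rand(10, 5), np.random.randint(0, 2, 10))
b = LabelledCollection(np.random.rand(20, 5), np.random.randint(0, 2, 20))
ab = a + b  # hits the new np.ndarray branch: np.concatenate along axis 0
print(len(ab), ab.prevalence())  # 30 instances; prevalence computed over the union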

quapy/data/datasets.py

@ -1,5 +1,5 @@
import zipfile import zipfile
from util import download_file_if_not_exists, download_file, get_quapy_home from util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
import os import os
from os.path import join from os.path import join
from data.base import Dataset from data.base import Dataset
@ -12,7 +12,21 @@ TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'sem
'sst', 'wa', 'wb'] 'sst', 'wa', 'wb']
-def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
+def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
+    """
+    Load a Reviews dataset as a Dataset instance, as used in:
+    Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
+    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
+    :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
+    :param tfidf: set to True to transform the raw documents into tfidf weighted matrices
+    :param min_df: minimum number of documents that should contain a term in order for the term to be
+    kept (ignored if tfidf==False)
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the
+    default ~/quapy_data/ directory)
+    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow
+    faster subsequent invocations
+    :return: a Dataset instance
+    """
     assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
         f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
         f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
@@ -27,18 +41,37 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
     download_file_if_not_exists(URL_TRAIN, train_path)
     download_file_if_not_exists(URL_TEST, test_path)

-    data = Dataset.load(train_path, test_path, from_text)
+    pickle_path = None
+    if pickle:
+        pickle_path = join(data_home, 'reviews', 'pickle', f'{dataset_name}.pkl')
+    data = pickled_resource(pickle_path, Dataset.load, train_path, test_path, from_text)

     if tfidf:
         text2tfidf(data, inplace=True)
-        if min_df is not None:
-            reduce_columns(data, min_df=min_df, inplace=True)
+    if min_df is not None:
+        reduce_columns(data, min_df=min_df, inplace=True)

     return data
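
Note: with pickle=True, the parsed Dataset is dumped once and reloaded on later calls. A usage sketch (dataset name as in test.py; the pickle lands under ~/quapy_data/reviews/pickle/ by default):

import quapy as qp

# first call parses the raw files and pickles the Dataset; subsequent calls load the pickle
dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5, pickle=True)
print(dataset.training.prevalence(), dataset.test.prevalence())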
-def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
+def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False):
+    """
+    Load a Twitter dataset as a Dataset instance, as used in:
+    Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
+    Social Network Analysis and Mining 6(19), 1-22 (2016)
+    :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
+    'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
+    :param for_model_selection: if True, returns the train split as the training set and the devel split
+    as the test set; if False, returns the train+devel split as the training set and the test set as the
+    test set
+    :param min_df: minimum number of documents that should contain a term in order for the term to be kept
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the
+    default ~/quapy_data/ directory)
+    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow
+    faster subsequent invocations
+    :return: a Dataset instance
+    """
     assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
         f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
         f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
@@ -56,23 +89,27 @@ def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
     if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
         trainset_name = 'semeval'
-        testset_name = 'semeval' if model_selection else dataset_name
+        testset_name = 'semeval' if for_model_selection else dataset_name
         print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
               f"(called 'semeval'); returning training-set='{trainset_name}' and test-set='{testset_name}'")
     else:
         trainset_name = testset_name = dataset_name

-    if model_selection:
+    if for_model_selection:
         train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
         test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
     else:
         train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
-        if dataset_name == 'semeval16':
+        if dataset_name == 'semeval16':  # only semeval16 has a different test-file name
             test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
         else:
             test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')

-    data = Dataset.load(train, test, from_sparse)
+    pickle_path = None
+    if pickle:
+        mode = "train-dev" if for_model_selection else "train+dev-test"
+        pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
+    data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)

     if min_df is not None:
         reduce_columns(data, min_df=min_df, inplace=True)
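
Note: for_model_selection switches between the tuning pair (train vs. devel) and the final pair (train+devel vs. test), so hyperparameters are never selected on test data. A sketch:

import quapy as qp

# train/devel pair for tuning hyperparameters
devel = qp.datasets.fetch_twitter('hcr', for_model_selection=True, min_df=10, pickle=True)
# train+devel/test pair for the final evaluation
evaluation = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True)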

quapy/evaluation.py

@@ -14,7 +14,9 @@ def artificial_sampling_prediction(
         n_prevpoints=210,
         n_repetitions=1,
         n_jobs=-1,
-        random_seed=42):
+        random_seed=42,
+        verbose=True
+):
     """
     Performs the predictions for all samples generated according to the artificial sampling protocol.
     :param model: the model in charge of generating the class prevalence estimations
@@ -25,6 +27,7 @@ def artificial_sampling_prediction(
     :param n_jobs: number of jobs to be run in parallel
     :param random_seed: allows replicating the samplings. The seed is local to the method and does not affect
     any other random process.
+    :param verbose: if True, shows a progress bar
     :return: two ndarrays of shape [m,n], with m the number of samples (n_prevpoints*n_repetitions) and n the
     number of classes. The first contains the true prevalences of the generated samples, while the second
     contains the prevalence estimations
@@ -36,15 +39,12 @@ def artificial_sampling_prediction(
     if isinstance(model, AggregativeQuantifier):
         quantification_func = model.aggregate
         if isinstance(model, AggregativeProbabilisticQuantifier):
-            print('\tpreclassifying with soft')
             preclassified_instances = model.posterior_probabilities(test.instances)
         else:
-            print('\tpreclassifying with hard')
             preclassified_instances = model.classify(test.instances)
         test = LabelledCollection(preclassified_instances, test.labels)
     else:
         quantification_func = model.quantify
-        print('not an aggregative')

     def _predict_prevalences(index):
         sample = test.sampling_from_index(index)
@@ -52,8 +52,9 @@ def artificial_sampling_prediction(
         estim_prevalence = quantification_func(sample.instances)
         return true_prevalence, estim_prevalence

+    pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes
     results = Parallel(n_jobs=n_jobs)(
-        delayed(_predict_prevalences)(index) for index in tqdm(indexes, desc='[artificial sampling protocol] predicting')
+        delayed(_predict_prevalences)(index) for index in pbar
     )

     true_prevalences, estim_prevalences = zip(*results)
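
Note: verbose=False silences the progress bar, which matters when the protocol runs many times inside a grid search. A sketch, assuming a fitted quantifier `model` and a loaded `dataset` as in test.py:

import quapy as qp

true_prevs, estim_prevs = qp.evaluation.artificial_sampling_prediction(
    model, dataset.test, 500, n_prevpoints=21, verbose=False)
# true_prevs, estim_prevs are [m, n] arrays, one row per generated sample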

quapy/method/aggregative.py

@@ -3,12 +3,13 @@ from copy import deepcopy
 import functional as F
 import error
 from method.base import BaseQuantifier
-from quapy.classification.svmperf import SVMperf
-from quapy.data import LabelledCollection
+from classification.svmperf import SVMperf
+from data import LabelledCollection
 from sklearn.metrics import confusion_matrix
 from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
 from abc import abstractmethod
+from typing import Union


 # Abstract classes
@@ -77,13 +78,19 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
         self.learner.set_params(**parameters)


+class BinaryQuantifier(BaseQuantifier):
+    def _check_binary(self, data: LabelledCollection, quantifier_name):
+        assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
+                            f'Use the class OneVsAll to enable {quantifier_name} to work on single-label data.'
+
+
 # Helper
 # ------------------------------------
 def training_helper(learner,
                     data: LabelledCollection,
                     fit_learner: bool = True,
                     ensure_probabilistic=False,
-                    train_val_split=None):
+                    val_split: Union[LabelledCollection, float] = None):
     """
     Training procedure common to all Aggregative Quantifiers.
     :param learner: the learner to be fit
@@ -91,7 +98,9 @@ def training_helper(learner,
     :param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
     :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
     learner is not probabilistic, then a CalibratedClassifierCV instance of it is trained)
-    :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
+    :param val_split: if specified as a float, indicates the proportion of training instances that will define the
+    validation split (e.g., 0.3 for using 30% of the training set as validation data); if specified as a
+    LabelledCollection, represents the validation split itself
     :return: the learner trained on the training set, and the unused data (a LabelledCollection if val_split is
     specified, or None otherwise) to be used as a validation set for any subsequent parameter fitting
     """
@@ -101,10 +110,17 @@ def training_helper(learner,
             print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
                   f'The learner will be calibrated.')
             learner = CalibratedClassifierCV(learner, cv=5)
-    if train_val_split is not None:
-        if not (0 < train_val_split < 1):
-            raise ValueError(f'train/val split {train_val_split} out of range, must be in (0,1)')
-        train, unused = data.split_stratified(train_prop=train_val_split)
+    if val_split is not None:
+        if isinstance(val_split, float):
+            if not (0 < val_split < 1):
+                raise ValueError(f'train/val split {val_split} out of range, must be in (0,1)')
+            train, unused = data.split_stratified(train_prop=1 - val_split)
+        elif isinstance(val_split, LabelledCollection):
+            train = data
+            unused = val_split
+        else:
+            raise ValueError('val_split not understood; use either a float indicating the split proportion, '
+                             'or a LabelledCollection indicating the validation split')
     else:
         train, unused = data, None
     learner.fit(train.instances, train.labels)
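
Note: the two accepted forms of val_split differ in what gets trained on: a float carves a stratified validation split out of `data`, while a LabelledCollection leaves `data` intact for training and is returned as the validation set. A sketch of both calls (`data` and `held_out` are assumed LabelledCollections):

from sklearn.linear_model import LogisticRegression

# float: fit on a stratified 70% of data, get the remaining 30% back as validation
learner, val = training_helper(LogisticRegression(), data, val_split=0.3)

# LabelledCollection: fit on all of data, return held_out as the validation set
learner, val = training_helper(LogisticRegression(), data, val_split=held_out)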
@@ -148,8 +164,17 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
     def __init__(self, learner):
         self.learner = learner

-    def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
-        self.learner, validation = training_helper(self.learner, data, fit_learner, train_val_split=train_val_split)
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = 0.3):
+        """
+        Trains an ACC quantifier
+        :param data: the training set
+        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
+        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
+        validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
+        indicating the validation set itself
+        :return: self
+        """
+        self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split)
         self.cc = ClassifyAndCount(self.learner)
         y_ = self.classify(validation.instances)
         y = validation.labels
@@ -196,9 +221,18 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
     def __init__(self, learner):
         self.learner = learner

-    def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = 0.3):
+        """
+        Trains a PACC quantifier
+        :param data: the training set
+        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
+        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
+        validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
+        indicating the validation set itself
+        :return: self
+        """
         self.learner, validation = training_helper(
-            self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split
+            self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split
         )
         self.pcc = ProbabilisticClassifyAndCount(self.learner)
         y_ = self.classify(validation.instances)
@@ -262,7 +296,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
         return qs


-class HellingerDistanceY(AggregativeProbabilisticQuantifier):
+class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     """
     Implementation of the method based on the Hellinger Distance y (HDy) proposed by
     González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution
@@ -272,11 +306,19 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
     def __init__(self, learner):
         self.learner = learner

-    def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
-        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
-                            f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = 0.3):
+        """
+        Trains an HDy quantifier
+        :param data: the training set
+        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
+        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
+        validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
+        indicating the validation set itself
+        :return: self
+        """
+        self._check_binary(data, self.__class__.__name__)
         self.learner, validation = training_helper(
-            self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
+            self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
         Px = self.posterior_probabilities(validation.instances)
         self.Pxy1 = Px[validation.labels == 1]
         self.Pxy0 = Px[validation.labels == 0]
@@ -312,7 +354,7 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
         return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))


-class ExplicitLossMinimisation(AggregativeQuantifier):
+class ExplicitLossMinimisation(AggregativeQuantifier, BinaryQuantifier):

     def __init__(self, svmperf_base, loss, **kwargs):
         self.svmperf_base = svmperf_base
@@ -320,8 +362,7 @@ class ExplicitLossMinimisation(AggregativeQuantifier):
         self.kwargs = kwargs

     def fit(self, data: LabelledCollection, fit_learner=True, *args):
-        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification' \
-                            f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
+        self._check_binary(data, self.__class__.__name__)
         assert fit_learner, 'the method requires that fit_learner=True'
         self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
         return self
@@ -386,6 +427,9 @@ class OneVsAll(AggregativeQuantifier):
             f'{self.__class__.__name__} expects non-binary data'
         assert isinstance(self.binary_quantifier, BaseQuantifier), \
             f'{self.binary_quantifier} does not seem to be a Quantifier'
+        if not isinstance(self.binary_quantifier, BinaryQuantifier):
+            raise ValueError(f'{self.binary_quantifier.__class__.__name__} does not seem to be an instance of '
+                             f'{BinaryQuantifier.__name__}')
         self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
         self.__parallel(self._delayed_binary_fit, data, **kwargs)
         return self
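
Note: OneVsAll now rejects wrapped quantifiers that do not subclass BinaryQuantifier. A sketch of the accepted pattern (mirroring test.py; a loaded multiclass `dataset` is assumed):

from sklearn.linear_model import LogisticRegression
import quapy as qp

hdy = qp.method.aggregative.HellingerDistanceY(LogisticRegression())
# HDy subclasses BinaryQuantifier, so the isinstance check passes
model = qp.method.aggregative.OneVsAll(hdy)
model.fit(dataset.training)  # fits one binary HDy per class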

quapy/util.py

@@ -6,6 +6,7 @@ import numpy as np
 import urllib
 import os
 from pathlib import Path
+import pickle


 def get_parallel_slices(n_tasks, n_jobs=-1):
@@ -58,4 +59,19 @@ def create_if_not_exist(path):

 def get_quapy_home():
-    return os.path.join(str(Path.home()), 'quapy_data')
+    home = os.path.join(str(Path.home()), 'quapy_data')
+    os.makedirs(home, exist_ok=True)
+    return home
+
+
+def pickled_resource(pickle_path: str, generation_func: callable, *args):
+    if pickle_path is None:
+        return generation_func(*args)
+    else:
+        if os.path.exists(pickle_path):
+            return pickle.load(open(pickle_path, 'rb'))
+        else:
+            instance = generation_func(*args)
+            os.makedirs(str(Path(pickle_path).parent), exist_ok=True)
+            pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+            return instance
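
Note: pickled_resource turns any expensive generation function into a cached one: given a path it loads the pickle if present (creating it otherwise), and given None it is a pass-through. A usage sketch (function and path are illustrative):

def expensive_build(n):
    return list(range(n))

data = pickled_resource('./cache/thousand.pkl', expensive_build, 1000)  # computes and dumps once
data = pickled_resource('./cache/thousand.pkl', expensive_build, 1000)  # now just loads the pickle
data = pickled_resource(None, expensive_build, 1000)                    # caching bypassed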

test.py

@@ -3,11 +3,13 @@ from sklearn.svm import LinearSVC
 import quapy as qp
 import quapy.functional as F
 import sys
+import numpy as np

 #qp.datasets.fetch_reviews('hp')
 #qp.datasets.fetch_twitter('sst')
 #sys.exit()
+from model_selection import GridSearchQ

 SAMPLE_SIZE=500
 binary = False
@@ -17,27 +19,31 @@ if binary:
     dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
 else:
-    dataset = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10)
-    dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
+    dataset = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True)
+    # dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)

 print('dataset loaded')

 # training a quantifier
-learner = LogisticRegression()
+learner = LogisticRegression(max_iter=1000)
 # model = qp.method.aggregative.ClassifyAndCount(learner)
-# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
+model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
 # model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
 # model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
 # model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
 # model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
-model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
+# model = qp.method.aggregative.SVMQ(svmperf_home, C=1)

-if not binary:
+if not binary and isinstance(model, qp.method.aggregative.BinaryQuantifier):
     model = qp.method.aggregative.OneVsAll(model)

-print('fitting model')
-model.fit(dataset.training)
+# Model fit and Evaluation on the test data
+# ----------------------------------------------------------------------------
+
+print(f'fitting model {model.__class__.__name__}')
+train, val = dataset.training.split_stratified(0.6)
+model.fit(train, val_split=val)

 # estimating class prevalences
 print('quantifying')
@@ -47,14 +53,15 @@ prevalences_true = dataset.test.prevalence()
 # evaluation (one single prediction)
 error = qp.error.mae(prevalences_true, prevalences_estim)

-print(f'method {model.__class__.__name__}')
 print(f'Evaluation in test (1 eval)')
 print(f'true prevalence {F.strprev(prevalences_true)}')
 print(f'estim prevalence {F.strprev(prevalences_estim)}')
 print(f'mae={error:.3f}')

+# Model fit and Evaluation according to the artificial sampling protocol
+# ----------------------------------------------------------------------------
+
 max_evaluations = 5000
 n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
 n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
@@ -70,3 +77,30 @@ for error in qp.error.QUANTIFICATION_ERROR:
     score = error(true_prev, estim_prev)
     print(f'{error.__name__}={score:.5f}')
+
+# Model selection and Evaluation according to the artificial sampling protocol
+# ----------------------------------------------------------------------------
+
+param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}
+
+model_selection = GridSearchQ(model,
+                              param_grid=param_grid,
+                              sample_size=SAMPLE_SIZE,
+                              eval_budget=max_evaluations // 10,
+                              error='mae',
+                              refit=True,
+                              verbose=True)
+
+# model = model_selection.fit(dataset.training, validation=0.3)
+model = model_selection.fit(train, validation=val)
+print(f'Model selection: best_params = {model_selection.best_params_}')
+print(f'param scores:')
+for params, score in model_selection.param_scores_.items():
+    print(f'\t{params}: {score:.5f}')
+
+true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
+print(f'After model selection: Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
+for error in qp.error.QUANTIFICATION_ERROR:
+    score = error(true_prev, estim_prev)
+    print(f'{error.__name__}={score:.5f}')