From 7bed93dcbf17014795a0c25f7d2c44ecfe6cecc1 Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Tue, 22 Dec 2020 17:43:23 +0100
Subject: [PATCH] added model selection for quantification

---
 quapy/data/base.py          | 23 +++++++---
 quapy/data/datasets.py      | 59 +++++++++++++++++++++-----
 quapy/evaluation.py         | 11 ++---
 quapy/method/aggregative.py | 84 ++++++++++++++++++++++++++++---------
 quapy/util.py               | 18 +++++++-
 test.py                     | 54 +++++++++++++++++++-----
 6 files changed, 196 insertions(+), 53 deletions(-)

diff --git a/quapy/data/base.py b/quapy/data/base.py
index 45f9a76..eec4d80 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -4,11 +4,19 @@ from sklearn.model_selection import train_test_split
 from quapy.functional import artificial_prevalence_sampling
 from scipy.sparse import vstack
 
+from util import temp_seed
+
 
 class LabelledCollection:
 
     def __init__(self, instances, labels, n_classes=None):
-        self.instances = instances if issparse(instances) else np.asarray(instances)
+        if issparse(instances):
+            self.instances = instances
+        elif isinstance(instances, list) and len(instances)>0 and isinstance(instances[0], str):
+            # lists of strings take up too much memory as fixed-width string ndarrays (object ndarrays add some overhead but are far smaller)
+            self.instances = np.asarray(instances, dtype=object)
+        else:
+            self.instances = np.asarray(instances)
         self.labels = np.asarray(labels, dtype=int)
         n_docs = len(self)
         if n_classes is None:
@@ -87,6 +95,7 @@ class LabelledCollection:
         return LabelledCollection(documents, labels, n_classes=self.n_classes)
 
     def split_stratified(self, train_prop=0.6):
+        # with temp_seed(42):
         tr_docs, te_docs, tr_labels, te_labels = \
             train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels)
         return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
@@ -110,14 +119,16 @@ class LabelledCollection:
             yield self.uniform_sampling_index(sample_size)
 
     def __add__(self, other):
-        if issparse(self.instances) and issparse(other.documents):
-            docs = vstack([self.instances, other.documents])
-        elif isinstance(self.instances, list) and isinstance(other.documents, list):
-            docs = self.instances + other.documents
+        if issparse(self.instances) and issparse(other.instances):
+            join_instances = vstack([self.instances, other.instances])
+        elif isinstance(self.instances, list) and isinstance(other.instances, list):
+            join_instances = self.instances + other.instances
+        elif isinstance(self.instances, np.ndarray) and isinstance(other.instances, np.ndarray):
+            join_instances = np.concatenate([self.instances, other.instances])
         else:
             raise NotImplementedError('unsupported operation for collection types')
         labels = np.concatenate([self.labels, other.labels])
-        return LabelledCollection(docs, labels)
+        return LabelledCollection(join_instances, labels)
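
A hedged sketch (not part of the diff) of what the rewritten __init__ and __add__ accept; LabelledCollection and prevalence() come from quapy itself, the toy data is illustrative, and the import path may differ under the relative-import scheme this patch adopts:

    import numpy as np
    from quapy.data.base import LabelledCollection

    # lists of strings are kept as ndarrays of Python objects by the new __init__ branch
    a = LabelledCollection(['good film', 'loved it'], [1, 1], n_classes=2)
    b = LabelledCollection(['terrible', 'awful plot'], [0, 0], n_classes=2)

    # both operands are object ndarrays here, so the new __add__ branch concatenates them
    ab = a + b
    print(len(ab.instances), ab.prevalence())
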
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index 9b486ed..e2def20 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -1,5 +1,5 @@
 import zipfile
-from util import download_file_if_not_exists, download_file, get_quapy_home
+from util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
 import os
 from os.path import join
 from data.base import Dataset
@@ -12,7 +12,21 @@ TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'sem
                               'sst', 'wa', 'wb']
 
 
-def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
+def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
+    """
+    Load a Reviews dataset as a Dataset instance, as used in:
+    Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
+    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
+    :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
+    :param tfidf: set to True to transform the raw documents into tfidf-weighted matrices
+    :param min_df: minimum number of documents that should contain a term in order for the term to be
+    kept (ignored if tfidf==False)
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+    ~/quapy_data/ directory)
+    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
+    faster subsequent invocations
+    :return: a Dataset instance
+    """
     assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
         f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
         f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
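
A usage sketch of the new pickle flag, assuming fetch_reviews is reached as qp.datasets.fetch_reviews (as in test.py below); dataset name and parameters are illustrative:

    import quapy as qp

    # first call: loads the raw documents and dumps the Dataset under <quapy_home>/reviews/pickle/
    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5, pickle=True)
    # subsequent calls: the pickled Dataset is loaded instead of being re-generated
    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5, pickle=True)
    print(dataset.training.prevalence(), dataset.test.prevalence())
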
@@ -27,18 +41,37 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
     download_file_if_not_exists(URL_TRAIN, train_path)
     download_file_if_not_exists(URL_TEST, test_path)
 
-    data = Dataset.load(train_path, test_path, from_text)
+    pickle_path = None
+    if pickle:
+        pickle_path = join(data_home, 'reviews', 'pickle', f'{dataset_name}.pkl')
+    data = pickled_resource(pickle_path, Dataset.load, train_path, test_path, from_text)
 
     if tfidf:
         text2tfidf(data, inplace=True)
-
-    if min_df is not None:
-        reduce_columns(data, min_df=min_df, inplace=True)
+        if min_df is not None:
+            reduce_columns(data, min_df=min_df, inplace=True)
 
     return data
 
 
-def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
+def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False):
+    """
+    Load a Twitter dataset as a Dataset instance, as used in:
+    Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
+    Social Network Analysis and Mining 6(19), 1–22 (2016)
+
+    :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
+    'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
+    :param for_model_selection: if True, then returns the train split as the training set and the devel split
+    as the test set; if False, then returns the train+devel split as the training set and the test split as the
+    test set
+    :param min_df: minimum number of documents that should contain a term in order for the term to be kept
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+    ~/quapy_data/ directory)
+    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
+    faster subsequent invocations
+    :return: a Dataset instance
+    """
     assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
         f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
         f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
@@ -56,23 +89,27 @@ def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=No
 
     if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
         trainset_name = 'semeval'
-        testset_name = 'semeval' if model_selection else dataset_name
+        testset_name = 'semeval' if for_model_selection else dataset_name
         print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
               f"(called 'semeval'); returning training-set='{trainset_name}' and test-set={testset_name}")
     else:
         trainset_name = testset_name = dataset_name
 
-    if model_selection:
+    if for_model_selection:
         train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
         test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
     else:
         train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
-        if dataset_name == 'semeval16':
+        if dataset_name == 'semeval16':  # only semeval16 uses a different test-file name
             test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
         else:
             test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')
 
-    data = Dataset.load(train, test, from_sparse)
+    pickle_path = None
+    if pickle:
+        mode = "train-dev" if for_model_selection else "train+dev-test"
+        pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
+    data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)
 
     if min_df is not None:
         reduce_columns(data, min_df=min_df, inplace=True)
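
The two-phase workflow that for_model_selection is meant to support, sketched under the same qp.datasets assumption (the phase structure is an illustration, not code from this patch):

    import quapy as qp

    # phase 1: tune hyperparameters against the train/dev partition
    devel = qp.datasets.fetch_twitter('hcr', for_model_selection=True, min_df=10, pickle=True)

    # phase 2: retrain on train+dev and evaluate on the held-out test partition
    final = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True)
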
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
index 8003c38..c29f79c 100644
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@@ -14,7 +14,9 @@ def artificial_sampling_prediction(
         n_prevpoints=210,
         n_repetitions=1,
         n_jobs=-1,
-        random_seed=42):
+        random_seed=42,
+        verbose=True
+):
     """
     Performs the predictions for all samples generated according to the artificial sampling protocol.
     :param model: the model in charge of generating the class prevalence estimations
@@ -25,6 +27,7 @@ def artificial_sampling_prediction(
     :param n_jobs: number of jobs to be run in parallel
     :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
     any other random process.
+    :param verbose: if True, shows a progress bar
     :return: two ndarrays of [m,n] with m the number of samples (n_prevpoints*n_repetitions) and n the number of
     classes. The first one contains the true prevalences for the generated samples, while the second one
     contains the prevalence estimations
     """
@@ -36,15 +39,12 @@ def artificial_sampling_prediction(
 
     if isinstance(model, AggregativeQuantifier):
         quantification_func = model.aggregate
         if isinstance(model, AggregativeProbabilisticQuantifier):
-            print('\tpreclassifying with soft')
             preclassified_instances = model.posterior_probabilities(test.instances)
         else:
-            print('\tpreclassifying with hard')
             preclassified_instances = model.classify(test.instances)
         test = LabelledCollection(preclassified_instances, test.labels)
     else:
         quantification_func = model.quantify
-        print('not an aggregative')
 
     def _predict_prevalences(index):
         sample = test.sampling_from_index(index)
@@ -52,8 +52,9 @@ def artificial_sampling_prediction(
         estim_prevalence = quantification_func(sample.instances)
         return true_prevalence, estim_prevalence
 
+    pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes
     results = Parallel(n_jobs=n_jobs)(
-        delayed(_predict_prevalences)(index) for index in tqdm(indexes, desc='[artificial sampling protocol] predicting')
+        delayed(_predict_prevalences)(index) for index in pbar
     )
 
     true_prevalences, estim_prevalences = zip(*results)
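
The new verbose flag in context; a sketch mirroring the positional call style used in test.py below (model stands for any fitted quantifier, and the sample size and prevalence grid are illustrative):

    import quapy as qp

    # disable the progress bar, e.g., when the protocol runs inside a model-selection loop
    true_prevs, estim_prevs = qp.evaluation.artificial_sampling_prediction(
        model, dataset.test, 500, n_prevpoints=21, verbose=False)
    print(qp.error.mae(true_prevs, estim_prevs))
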
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 92918f5..3b18090 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -3,12 +3,13 @@ from copy import deepcopy
 import functional as F
 import error
 from method.base import BaseQuantifier
-from quapy.classification.svmperf import SVMperf
-from quapy.data import LabelledCollection
+from classification.svmperf import SVMperf
+from data import LabelledCollection
 from sklearn.metrics import confusion_matrix
 from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
 from abc import abstractmethod
+from typing import Union
 
 
 # Abstract classes
@@ -77,13 +78,19 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
         self.learner.set_params(**parameters)
 
 
+class BinaryQuantifier(BaseQuantifier):
+    def _check_binary(self, data: LabelledCollection, quantifier_name):
+        assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
+            f'Use the class OneVsAll to enable {quantifier_name} to work on single-label data.'
+
+
 # Helper
 # ------------------------------------
 def training_helper(learner,
                     data: LabelledCollection,
                     fit_learner: bool = True,
                     ensure_probabilistic=False,
-                    train_val_split=None):
+                    val_split: Union[LabelledCollection, float] = None):
     """
     Training procedure common to all Aggregative Quantifiers.
     :param learner: the learner to be fit
@@ -91,7 +98,9 @@ def training_helper(learner,
     :param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
     :param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
     learner is not probabilistic, then a CalibratedCV instance of it is trained)
-    :param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
+    :param val_split: if specified as a float, indicates the proportion of training instances that will define the
+    validation split (e.g., 0.3 for using 30% of the training set as validation data); if specified as a
+    LabelledCollection, represents the validation split itself
     :return: the learner trained on the training set, and the unused data (a LabelledCollection if val_split was
     specified, or None otherwise) to be used as a validation set for any subsequent parameter fitting
     """
@@ -101,10 +110,17 @@ def training_helper(learner,
             print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
                   f'The learner will be calibrated.')
             learner = CalibratedClassifierCV(learner, cv=5)
-    if train_val_split is not None:
-        if not (0 < train_val_split < 1):
-            raise ValueError(f'train/val split {train_val_split} out of range, must be in (0,1)')
-        train, unused = data.split_stratified(train_prop=train_val_split)
+    if val_split is not None:
+        if isinstance(val_split, float):
+            if not (0 < val_split < 1):
+                raise ValueError(f'train/val split {val_split} out of range, must be in (0,1)')
+            train, unused = data.split_stratified(train_prop=1-val_split)
+        elif isinstance(val_split, LabelledCollection):
+            train = data
+            unused = val_split
+        else:
+            raise ValueError('val_split not understood; use either a float indicating the split proportion, '
+                             'or a LabelledCollection indicating the validation split')
     else:
         train, unused = data, None
     learner.fit(train.instances, train.labels)
@@ -148,8 +164,17 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
     def __init__(self, learner):
         self.learner = learner
 
-    def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
-        self.learner, validation = training_helper(self.learner, data, fit_learner, train_val_split=train_val_split)
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = 0.3):
+        """
+        Trains an ACC quantifier
+        :param data: the training set
+        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
+        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
+        validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
+        indicating the validation set itself
+        :return: self
+        """
+        self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split)
         self.cc = ClassifyAndCount(self.learner)
         y_ = self.classify(validation.instances)
         y = validation.labels
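
Both val_split modes accepted by the new fit signature, sketched for ACC (the dataset variable is assumed to be a quapy Dataset as in test.py; everything else appears in this patch):

    from sklearn.linear_model import LogisticRegression
    import quapy as qp

    acc = qp.method.aggregative.AdjustedClassifyAndCount(LogisticRegression())

    # float mode: hold out 30% of the training set for estimating the adjustment
    acc.fit(dataset.training, val_split=0.3)

    # collection mode: pass an explicit, pre-computed validation split
    train, val = dataset.training.split_stratified(train_prop=0.6)
    acc.fit(train, val_split=val)
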
@@ -196,9 +221,18 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
     def __init__(self, learner):
         self.learner = learner
 
-    def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = 0.3):
+        """
+        Trains a PACC quantifier
+        :param data: the training set
+        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
+        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
+        validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
+        indicating the validation set itself
+        :return: self
+        """
         self.learner, validation = training_helper(
-            self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split
+            self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split
         )
         self.pcc = ProbabilisticClassifyAndCount(self.learner)
         y_ = self.classify(validation.instances)
@@ -262,7 +296,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
         return qs
 
 
-class HellingerDistanceY(AggregativeProbabilisticQuantifier):
+class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     """
     Implementation of the method based on the Hellinger Distance y (HDy) proposed by
     González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution
@@ -272,11 +306,19 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
     def __init__(self, learner):
         self.learner = learner
 
-    def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
-        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
-            f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection] = 0.3):
+        """
+        Trains an HDy quantifier
+        :param data: the training set
+        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
+        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
+        validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
+        indicating the validation set itself
+        :return: self
+        """
+        self._check_binary(data, self.__class__.__name__)
         self.learner, validation = training_helper(
-            self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
+            self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
         Px = self.posterior_probabilities(validation.instances)
         self.Pxy1 = Px[validation.labels == 1]
         self.Pxy0 = Px[validation.labels == 0]
@@ -312,7 +354,7 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
         return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
 
 
-class ExplicitLossMinimisation(AggregativeQuantifier):
+class ExplicitLossMinimisation(AggregativeQuantifier, BinaryQuantifier):
 
     def __init__(self, svmperf_base, loss, **kwargs):
         self.svmperf_base = svmperf_base
@@ -320,8 +362,7 @@ class ExplicitLossMinimisation(AggregativeQuantifier):
         self.loss = loss
         self.kwargs = kwargs
 
     def fit(self, data: LabelledCollection, fit_learner=True, *args):
-        assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification' \
-            f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
+        self._check_binary(data, self.__class__.__name__)
         assert fit_learner, 'the method requires that fit_learner=True'
         self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
         return self
@@ -386,6 +427,9 @@ class OneVsAll(AggregativeQuantifier):
             f'{self.__class__.__name__} expects non-binary data'
         assert isinstance(self.binary_quantifier, BaseQuantifier), \
             f'{self.binary_quantifier} does not seem to be a Quantifier'
+        if not isinstance(self.binary_quantifier, BinaryQuantifier):
+            raise ValueError(f'{self.binary_quantifier.__class__.__name__} does not seem to be an instance of '
+                             f'{BinaryQuantifier.__name__}')
         self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
         self.__parallel(self._delayed_binary_fit, data, **kwargs)
         return self
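
How the BinaryQuantifier marker interacts with OneVsAll, sketched: a binary-only method such as HDy must be wrapped before it can quantify multiclass data, as the _check_binary message above instructs (the dataset variable is again assumed):

    from sklearn.linear_model import LogisticRegression
    import quapy as qp

    hdy = qp.method.aggregative.HellingerDistanceY(LogisticRegression())
    model = qp.method.aggregative.OneVsAll(hdy)  # one binary HDy is fit per class
    model.fit(dataset.training)
    print(model.quantify(dataset.test.instances))
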
diff --git a/quapy/util.py b/quapy/util.py
index d9430c8..899519a 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -6,6 +6,7 @@ import numpy as np
 import urllib
 import os
 from pathlib import Path
+import pickle
 
 
 def get_parallel_slices(n_tasks, n_jobs=-1):
@@ -58,4 +59,19 @@ def create_if_not_exist(path):
 
 
 def get_quapy_home():
-    return os.path.join(str(Path.home()), 'quapy_data')
\ No newline at end of file
+    home = os.path.join(str(Path.home()), 'quapy_data')
+    os.makedirs(home, exist_ok=True)
+    return home
+
+
+def pickled_resource(pickle_path: str, generation_func: callable, *args):
+    if pickle_path is None:
+        return generation_func(*args)
+    else:
+        if os.path.exists(pickle_path):
+            return pickle.load(open(pickle_path, 'rb'))
+        else:
+            instance = generation_func(*args)
+            os.makedirs(str(Path(pickle_path).parent), exist_ok=True)
+            pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+            return instance
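
pickled_resource in isolation; the cache path and generator function here are hypothetical, only the helper itself comes from this patch:

    from util import pickled_resource  # relative-import style used throughout the patch

    def costly_build(n):
        return list(range(n))

    data = pickled_resource('/tmp/demo.pkl', costly_build, 10**6)  # generated and dumped
    data = pickled_resource('/tmp/demo.pkl', costly_build, 10**6)  # loaded from the pickle
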
diff --git a/test.py b/test.py
index 85d8bb6..0ade026 100644
--- a/test.py
+++ b/test.py
@@ -3,11 +3,13 @@ from sklearn.svm import LinearSVC
 import quapy as qp
 import quapy.functional as F
 import sys
+import numpy as np
 
 #qp.datasets.fetch_reviews('hp')
 #qp.datasets.fetch_twitter('sst')
 
 #sys.exit()
+from model_selection import GridSearchQ
 
 SAMPLE_SIZE=500
 binary = False
@@ -17,27 +19,31 @@ if binary:
     dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
 
 else:
-    dataset = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10)
-    dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
+    dataset = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True)
+    # dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
 
 print('dataset loaded')
 
 # training a quantifier
-learner = LogisticRegression()
+learner = LogisticRegression(max_iter=1000)
 # model = qp.method.aggregative.ClassifyAndCount(learner)
-# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
+model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
 # model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
 # model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
 # model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
 # model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
-model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
+# model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
 
-if not binary:
+if not binary and isinstance(model, qp.method.aggregative.BinaryQuantifier):
     model = qp.method.aggregative.OneVsAll(model)
 
-print('fitting model')
-model.fit(dataset.training)
+# Model fit and Evaluation on the test data
+# ----------------------------------------------------------------------------
+
+print(f'fitting model {model.__class__.__name__}')
+train, val = dataset.training.split_stratified(0.6)
+model.fit(train, val_split=val)
 
 # estimating class prevalences
 print('quantifying')
@@ -47,14 +53,15 @@ prevalences_true = dataset.test.prevalence()
 
 # evaluation (one single prediction)
 error = qp.error.mae(prevalences_true, prevalences_estim)
 
-print(f'method {model.__class__.__name__}')
-
 print(f'Evaluation in test (1 eval)')
 print(f'true prevalence {F.strprev(prevalences_true)}')
 print(f'estim prevalence {F.strprev(prevalences_estim)}')
 print(f'mae={error:.3f}')
 
+# Model fit and Evaluation according to the artificial sampling protocol
+# ----------------------------------------------------------------------------
+
 max_evaluations = 5000
 n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
 n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
@@ -70,3 +77,30 @@ for error in qp.error.QUANTIFICATION_ERROR:
     score = error(true_prev, estim_prev)
     print(f'{error.__name__}={score:.5f}')
 
+
+# Model selection and Evaluation according to the artificial sampling protocol
+# ----------------------------------------------------------------------------
+
+param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
+
+model_selection = GridSearchQ(model,
+                              param_grid=param_grid,
+                              sample_size=SAMPLE_SIZE,
+                              eval_budget=max_evaluations//10,
+                              error='mae',
+                              refit=True,
+                              verbose=True)
+
+# model = model_selection.fit(dataset.training, validation=0.3)
+model = model_selection.fit(train, validation=val)
+print(f'Model selection: best_params = {model_selection.best_params_}')
+print(f'param scores:')
+for params, score in model_selection.param_scores_.items():
+    print(f'\t{params}: {score:.5f}')
+
+true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
+
+print(f'After model selection: Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
+for error in qp.error.QUANTIFICATION_ERROR:
+    score = error(true_prev, estim_prev)
+    print(f'{error.__name__}={score:.5f}')
\ No newline at end of file