From 03cf73aff6396c3aef893eb09695c427a7183684 Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Fri, 22 Jan 2021 18:01:51 +0100
Subject: [PATCH] refactor: let methods requiring a val_split declare a default
 value in __init__

Methods requiring a val_split can now declare a default value in the __init__
method; this value is used whenever fit is called without specifying a
val_split, which is now None by default in fit, i.e., fit falls back to the
value set at construction time (generally 0.4). Some UCI datasets have been
added. Ensembles can now be optimized for quantification, and can be trained
on samples of smaller size.
---
 README.md                     |  33 ++++++++-
 TODO.txt                      |   3 +-
 TweetSentQuant/experiments.py | 106 +++++++++++++++++-----------
 TweetSentQuant/settings.py    |   5 +-
 quapy/data/base.py            |   4 +-
 quapy/data/datasets.py        | 127 +++++++++++++++++++++-------------
 quapy/evaluation.py           |   1 -
 quapy/method/aggregative.py   |  30 +++++---
 quapy/method/meta.py          |  66 +++++++++++-------
 quapy/model_selection.py      |   8 ++-
 test.py                       |  37 +++++++---
 11 files changed, 277 insertions(+), 143 deletions(-)

diff --git a/README.md b/README.md
index aed5f1b..c1b7b00 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,34 @@
 # QuaPy
-A Quantification framework written in Python.
\ No newline at end of file
+QuaPy is an open source framework for Quantification (a.k.a. Supervised Prevalence Estimation)
+written in Python.
+
+QuaPy is built around the concept of the data sample, and provides implementations of the
+most important concepts in the quantification literature: the main quantification baselines,
+many advanced quantification methods, quantification-oriented model selection, and the
+evaluation measures and protocols commonly used for evaluating quantification methods.
+QuaPy also integrates commonly used datasets and offers visualization tools that facilitate
+the analysis and interpretation of results.
+
+```python
+import quapy as qp
+from sklearn.linear_model import LogisticRegression
+
+dataset = qp.datasets.fetch_twitter('semeval16')
+
+# create an "Adjusted Classify & Count" quantifier
+model = qp.method.aggregative.ACC(LogisticRegression())
+model.fit(dataset.training)
+
+prevalences_estim = model.quantify(dataset.test.instances)
+prevalences_true = dataset.test.prevalence()
+
+error = qp.error.mae(prevalences_true, prevalences_estim)
+
+print(f'MAE={error:.3f}')
+```
+
+QuaPy currently supports binary and single-label multiclass quantification.
+
diff --git a/TODO.txt b/TODO.txt
index 16de883..8fd1ff5 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -24,4 +24,5 @@ Implement HDy for single-label?
 Rename EMQ to SLD ?
 How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up to one always?
-Parallelize the kFCV in ACC and PACC
\ No newline at end of file
+Parallelize the kFCV in ACC and PACC
+Requirements: xlrd for reading Excel files
\ No newline at end of file
diff --git a/TweetSentQuant/experiments.py b/TweetSentQuant/experiments.py
index 4fe6362..84269f5 100644
--- a/TweetSentQuant/experiments.py
+++ b/TweetSentQuant/experiments.py
@@ -20,49 +20,64 @@ import shutil
 
 DEBUG = False
 
+def newLR():
+    return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
+
+__C_range = np.logspace(-4, 5, 10)
+lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
+svmperf_params = {'C': __C_range}
+
 def quantification_models():
-    def newLR():
-        return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
-    __C_range = np.logspace(-4, 5, 10)
-    lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
-    svmperf_params = {'C': __C_range}
-    # methods tested in Gao & Sebastiani 2016
-    # yield 'cc', CC(newLR()), lr_params
-    # yield 'acc', ACC(newLR()), lr_params
-    # yield 'pcc', PCC(newLR()), lr_params
-    # yield 'pacc', PACC(newLR()), lr_params
-    # yield 'sld', EMQ(newLR()), lr_params
-    # yield 'svmq', OneVsAll(SVMQ(args.svmperfpath)), svmperf_params
-    # yield 'svmkld', OneVsAll(SVMKLD(args.svmperfpath)), svmperf_params
-    # yield 'svmnkld', OneVsAll(SVMNKLD(args.svmperfpath)), svmperf_params
-    #
-    # # methods added
-    # yield 'svmmae', OneVsAll(SVMAE(args.svmperfpath)), svmperf_params
-    # yield 'svmmrae', OneVsAll(SVMRAE(args.svmperfpath)), svmperf_params
-    # yield 'hdy', OneVsAll(HDy(newLR())), lr_params
+    yield 'cc', CC(newLR()), lr_params
+    yield 'acc', ACC(newLR()), lr_params
+    yield 'pcc', PCC(newLR()), lr_params
+    yield 'pacc', PACC(newLR()), lr_params
+    yield 'sld', EMQ(newLR()), lr_params
+    yield 'svmq', OneVsAll(SVMQ(args.svmperfpath)), svmperf_params
+    yield 'svmkld', OneVsAll(SVMKLD(args.svmperfpath)), svmperf_params
+    yield 'svmnkld', OneVsAll(SVMNKLD(args.svmperfpath)), svmperf_params
+    # methods added
+    yield 'svmmae', OneVsAll(SVMAE(args.svmperfpath)), svmperf_params
+    yield 'svmmrae', OneVsAll(SVMRAE(args.svmperfpath)), svmperf_params
+    yield 'hdy', OneVsAll(HDy(newLR())), lr_params
+
+
+def quantification_cuda_models():
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     print(f'Running QuaNet in {device}')
-    if DEBUG:
-        lr_params={'C':[1,10]}
-        yield 'quanet', QuaNet(PCALR(**newLR().get_params()), settings.SAMPLE_SIZE,
-                               lstm_hidden_size=32, lstm_nlayers=1,
-                               tr_iter_per_poch=50, va_iter_per_poch=10,
-                               patience=3,
-                               checkpointdir=args.checkpointdir, device=device), lr_params
-    else:
-        yield 'quanet', QuaNet(PCALR(**newLR().get_params()), settings.SAMPLE_SIZE,
-                               checkpointdir=args.checkpointdir, device=device), lr_params
+    learner = PCALR(**newLR().get_params())
+    yield 'quanet', QuaNet(learner, settings.SAMPLE_SIZE, checkpointdir=args.checkpointdir, device=device), lr_params
 
-    #param_mod_sel={'sample_size':settings.SAMPLE_SIZE, 'n_prevpoints':21, 'n_repetitions':5}
-    #yield 'epaccmaeptr', EPACC(newLR(), param_grid=lr_params, optim='mae', policy='ptr', param_mod_sel=param_mod_sel, n_jobs=settings.ENSEMBLE_N_JOBS), None
-    # yield 'epaccmraeptr', EPACC(newLR(), param_grid=lr_params, optim='mrae', policy='ptr', param_mod_sel=param_mod_sel, n_jobs=settings.ENSEMBLE_N_JOBS), None
-    # yield 'epaccmae', EPACC(newLR(), param_grid=lr_params, optim='mae', policy='mae', param_mod_sel=param_mod_sel, n_jobs=settings.ENSEMBLE_N_JOBS), None
-    # yield 'epaccmrae', EPACC(newLR(), param_grid=lr_params, optim='mrae', policy='mrae', param_mod_sel=param_mod_sel, n_jobs=settings.ENSEMBLE_N_JOBS), None
+def quantification_ensembles():
+    param_mod_sel = {
+        'sample_size': settings.SAMPLE_SIZE,
+        'n_prevpoints': 21,
+        'n_repetitions': 5,
+        'verbose': False
+    }
+    common = {
+        'max_sample_size': 500,
+        'n_jobs': settings.ENSEMBLE_N_JOBS,
+        'param_grid': lr_params,
+        'param_mod_sel': param_mod_sel,
+        'val_split': 0.4
+    }
+
+    # hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
+    # will be skipped (by setting hyperparameters to None)
+    hyper_none = None
+    yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none
+    yield 'epaccmaemae', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none
+    yield 'esldmaeptr', EEMQ(newLR(), optim='mae', policy='ptr', **common), hyper_none
+    yield 'esldmaemae', EEMQ(newLR(), optim='mae', policy='mae', **common), hyper_none
 
-    #yield 'mlpe', MaximumLikelihoodPrevalenceEstimation(), {}
+    yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none
+    yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none
+    yield 'esldmraeptr', EEMQ(newLR(), optim='mrae', policy='ptr', **common), hyper_none
+    yield 'esldmraemrae', EEMQ(newLR(), optim='mrae', policy='mrae', **common), hyper_none
 
 
 def evaluate_experiment(true_prevalences, estim_prevalences):
@@ -119,10 +134,7 @@ def run(experiment):
         benchmark_devel.stats()
 
     # model selection (hyperparameter optimization for a quantification-oriented loss)
-    if hyperparams is None:
-        model.fit(benchmark_devel.training, benchmark_devel.test)
-        best_params = {}
-    else:
+    if hyperparams is not None:
         model_selection = qp.model_selection.GridSearchQ(
             model,
             param_grid=hyperparams,
@@ -137,6 +149,8 @@ def run(experiment):
         model_selection.fit(benchmark_devel.training, benchmark_devel.test)
         model = model_selection.best_model()
         best_params = model_selection.best_params_
+    else:
+        best_params = {}
 
     # model evaluation
     test_names = [dataset_name] if dataset_name != 'semeval' else ['semeval13', 'semeval14', 'semeval15']
@@ -183,9 +197,19 @@ if __name__ == '__main__':
     optim_losses = ['mae']  # ['mae', 'mrae']
     datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN
 
-    models = quantification_models()
-    results = Parallel(n_jobs=settings.N_JOBS)(
+    #models = quantification_models()
+    #Parallel(n_jobs=settings.N_JOBS)(
+    #    delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
+    #)
+
+    #models = quantification_cuda_models()
+    #Parallel(n_jobs=settings.CUDA_N_JOBS)(
+    #    delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
+    #)
+
+    models = quantification_ensembles()
+    Parallel(n_jobs=1)(
         delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
     )
 
diff --git a/TweetSentQuant/settings.py b/TweetSentQuant/settings.py
index 6993c37..418385d 100644
--- a/TweetSentQuant/settings.py
+++ b/TweetSentQuant/settings.py
@@ -1,7 +1,8 @@
 import multiprocessing
 
-N_JOBS = 1  #multiprocessing.cpu_count()
+N_JOBS = -2  #multiprocessing.cpu_count()
+CUDA_N_JOBS = 1
 ENSEMBLE_N_JOBS = -2
+
 SAMPLE_SIZE = 100
 
-assert N_JOBS==1 or ENSEMBLE_N_JOBS==1, 'general N_JOBS and ENSEMBLE_N_JOBS should not be both greater than 1'
\ No newline at end of file
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 697bcf6..0fed0d7 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -92,10 +92,10 @@ class LabelledCollection:
         labels = self.labels[index]
         return LabelledCollection(documents, labels, n_classes=self.n_classes)
 
-    def split_stratified(self, train_prop=0.6):
+    def split_stratified(self, train_prop=0.6, random_state=None):
         # with temp_seed(42):
         tr_docs, te_docs, tr_labels, te_labels = \
-            train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels)
+            train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state)
         return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
 
     def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
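The `random_state` argument added to `split_stratified` makes the stratified split reproducible. A minimal sketch of the intended use (the toy data is illustrative, not part of the patch; it assumes labels are stored as a numpy array, as the class does):

```python
import numpy as np
from quapy.data.base import LabelledCollection

# a toy binary collection: 100 instances with a 30% positive prevalence
X = np.random.rand(100, 5)
y = np.array([1] * 30 + [0] * 70)
data = LabelledCollection(X, y)

# the same random_state yields the same stratified split every time
train1, val1 = data.split_stratified(train_prop=0.6, random_state=0)
train2, val2 = data.split_stratified(train_prop=0.6, random_state=0)
assert (train1.labels == train2.labels).all()
```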
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index dca22cd..aa839a2 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -1,6 +1,7 @@
 import os
 import zipfile
 from os.path import join
+from urllib.error import HTTPError
 
 import pandas as pd
 
@@ -137,9 +138,11 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                 'balance.1', 'balance.2', 'balance.3',
                 'breast-cancer',
                 'cmc.1', 'cmc.2', 'cmc.3',
-                'ctg.1', 'ctg.2', 'ctg.3']  # ongoing...
+                'ctg.1', 'ctg.2', 'ctg.3',
+                #'diabetes', # <-- I haven't found this one...
+                'german']  # ongoing...
 
-def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
+def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):
 
     assert dataset_name in UCI_DATASETS, \
         f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
@@ -147,22 +150,6 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
     if data_home is None:
         data_home = get_quapy_home()
 
-    identifier_map = {
-        'acute.a': 'acute',
-        'acute.b': 'acute',
-        'balance.1': 'balance-scale',
-        'balance.2': 'balance-scale',
-        'balance.3': 'balance-scale',
-        'breast-cancer': 'breast-cancer-wisconsin',
-        'cmc.1': 'cmc',
-        'cmc.2': 'cmc',
-        'cmc.3': 'cmc',
-        'ctg.1': 'ctg',
-        'ctg.2': 'ctg',
-        'ctg.3': 'ctg',
-
-    }
-
     dataset_fullname = {
         'acute.a': 'Acute Inflammations (urinary bladder)',
         'acute.b': 'Acute Inflammations (renal pelvis)',
@@ -176,27 +163,64 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
         'ctg.1': 'Cardiotocography Data Set (normal)',
         'ctg.2': 'Cardiotocography Data Set (suspect)',
         'ctg.3': 'Cardiotocography Data Set (pathologic)',
+        'german': 'Statlog German Credit Data',
     }
 
-    data_folder = {
-        'acute': 'diagnosis',
-        'balance-scale': 'balance-scale',
-        'breast-cancer-wisconsin': 'breast-cancer-wisconsin',
-        'cmc': 'cmc'
+    # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
+    # to download the raw dataset
+    identifier_map = {
+        'acute.a': 'acute',
+        'acute.b': 'acute',
+        'balance.1': 'balance-scale',
+        'balance.2': 'balance-scale',
+        'balance.3': 'balance-scale',
+        'breast-cancer': 'breast-cancer-wisconsin',
+        'cmc.1': 'cmc',
+        'cmc.2': 'cmc',
+        'cmc.3': 'cmc',
+        'ctg.1': '00193',
+        'ctg.2': '00193',
+        'ctg.3': '00193',
+        'german': 'statlog/german'
+    }
+
+    # the filename is the name of the file within the data_folder indexed by the identifier
+    file_name = {
+        'acute': 'diagnosis.data',
+        'balance-scale': 'balance-scale.data',
+        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data',
+        'cmc': 'cmc.data',
+        '00193': 'CTG.xls',
+        'statlog/german': 'german.data-numeric'
+    }
+
+    # the filename containing the dataset description (if any)
+    desc_name = {
+        'acute': 'diagnosis.names',
+        'balance-scale': 'balance-scale.names',
+        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names',
+        'cmc': 'cmc.names',
+        '00193': None,
+        'statlog/german': 'german.doc'
     }
 
     identifier = identifier_map[dataset_name]
     URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
-    data_path = join(data_home, 'uci_datasets', identifier)
-    download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.data', f'{data_path}/{identifier}.data')
-    download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.names', f'{data_path}/{identifier}.names')
+    data_dir = join(data_home, 'uci_datasets', identifier)
+    data_path = join(data_dir, file_name[identifier])
+    download_file_if_not_exists(f'{URL}/{file_name[identifier]}', data_path)
 
-    if verbose:
-        print(open(f'{data_path}/{identifier}.names', 'rt').read())
+    descfile = desc_name[identifier]
+    if descfile:
+        download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
+        if verbose:
+            print(open(f'{data_dir}/{descfile}', 'rt').read())
+    elif verbose:
+        print('no file description available')
 
     print(f'Loading {dataset_name} ({dataset_fullname[dataset_name]})')
     if identifier == 'acute':
-        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, encoding='utf-16', sep='\t')
+        df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
         if dataset_name == 'acute.a':
             y = binarize(df[6], pos_class='yes')
         elif dataset_name == 'acute.b':
@@ -208,7 +232,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
         X = df.loc[:, 0:5].values
 
     if identifier == 'balance-scale':
-        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
+        df = pd.read_csv(data_path, header=None, sep=',')
         if dataset_name == 'balance.1':
             y = binarize(df[0], pos_class='L')
         elif dataset_name == 'balance.2':
@@ -218,7 +242,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
         X = df.loc[:, 1:].astype(float).values
 
    if identifier == 'breast-cancer-wisconsin':
-        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
+        df = pd.read_csv(data_path, header=None, sep=',')
         Xy = df.loc[:, 1:10]
         Xy[Xy=='?']=np.nan
         Xy = Xy.dropna(axis=0)
@@ -227,7 +251,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
         y = binarize(Xy[10], pos_class=4)
 
     if identifier == 'cmc':
-        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
+        df = pd.read_csv(data_path, header=None, sep=',')
         X = df.loc[:, 0:8].astype(float).values
         y = df[9].astype(int).values
         if dataset_name == 'cmc.1':
@@ -237,25 +261,32 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
         elif dataset_name == 'cmc.3':
             y = binarize(y, pos_class=3)
 
+    if identifier == '00193':
+        df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
+        df = df[list(range(1,24))]  # select the columns numbered 1 to 23 (column 23 is the target label)
+        # replaces the header with the first row
+        new_header = df.iloc[0]  # grab the first row for the header
+        df = df[1:]  # take the data less the header row
+        df.columns = new_header  # set the header row as the df header
+        X = df.iloc[:, 0:22].astype(float).values
+        y = df['NSP'].astype(int).values
+        if dataset_name == 'ctg.1':  # 1==Normal
+            y = binarize(y, pos_class=1)
+        elif dataset_name == 'ctg.2':
+            y = binarize(y, pos_class=2)  # 2==Suspect
+        elif dataset_name == 'ctg.3':
+            y = binarize(y, pos_class=3)  # 3==Pathologic
+
+    if identifier == 'statlog/german':
+        df = pd.read_csv(data_path, header=None, delim_whitespace=True)
+        X = df.iloc[:, 0:24].astype(float).values
+        y = df[24].astype(int).values
+        y = binarize(y, pos_class=1)
+
     data = LabelledCollection(X, y)
     data.stats()
-    raise NotImplementedError()
-    #print(df)
-    #print(df.loc[:, 0:5].values)
-    #print(y)
+    return Dataset(*data.split_stratified(1-test_split, random_state=0))
 
-# X = __read_csv(f'{data_path}/{identifier}.data', separator='\t')
-# print(X)
 
-    #X, y = from_csv(f'{data_path}/{dataset_name}.data')
-    #y, classnames = reindex_labels(y)
-
-#def __read_csv(path, separator=','):
-#    x = []
-#    for instance in tqdm(open(path, 'rt', encoding='utf-16').readlines(), desc=f'reading {path}'):
-#        x.append(instance.strip().split(separator))
-#    return x
 
 def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
     df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
\ No newline at end of file
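With the new entries wired in, `fetch_UCIDataset` now returns a ready train/test `Dataset` instead of raising `NotImplementedError`. A usage sketch (the chosen dataset and printouts are illustrative):

```python
import quapy as qp

# downloads the raw file (and its description, if any) on first use
data = qp.datasets.fetch_UCIDataset('german', verbose=True, test_split=0.3)

print(f'#training={len(data.training)} #test={len(data.test)}')
print(f'training prevalence: {data.training.prevalence()}')
```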
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
index 5df2474..9674c4d 100644
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@@ -60,7 +60,6 @@ def artificial_sampling_prediction(
         estim_prevalence = quantification_func(sample.instances)
         return true_prevalence, estim_prevalence
 
-    print('predicting')
     pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes
     results = Parallel(n_jobs=n_jobs)(
         delayed(_predict_prevalences)(index) for index in pbar
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 57c2467..0053767 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -84,7 +84,7 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
 
     def set_params(self, **parameters):
         if isinstance(self.learner, CalibratedClassifierCV):
-            parameters={'base_estimator__'+k:v for k,v in parameters.items()}
+            parameters = {'base_estimator__'+k:v for k,v in parameters.items()}
         self.learner.set_params(**parameters)
 
     @property
@@ -172,10 +172,11 @@ class CC(AggregativeQuantifier):
 
 class ACC(AggregativeQuantifier):
 
-    def __init__(self, learner:BaseEstimator):
+    def __init__(self, learner:BaseEstimator, val_split=0.4):
         self.learner = learner
+        self.val_split = val_split
 
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.4):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection]=None):
         """
         Trains an ACC quantifier
         :param data: the training set
@@ -186,7 +187,8 @@ class ACC(AggregativeQuantifier):
         to estimate the parameters
         :return: self
         """
-        assert val_split is not None, 'val_split cannot be set to None'
+        if val_split is None:
+            val_split = self.val_split
 
         if isinstance(val_split, int):
             # kFCV estimation of parameters
             y, y_ = [], []
@@ -256,10 +258,11 @@ class PCC(AggregativeProbabilisticQuantifier):
 
 class PACC(AggregativeProbabilisticQuantifier):
 
-    def __init__(self, learner:BaseEstimator):
+    def __init__(self, learner: BaseEstimator, val_split=0.4):
         self.learner = learner
+        self.val_split = val_split
 
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.4):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=None):
         """
         Trains a PACC quantifier
         :param data: the training set
@@ -270,7 +273,9 @@ class PACC(AggregativeProbabilisticQuantifier):
         to estimate the parameters
         :return: self
         """
-        assert val_split is not None, 'val_split cannot be set to None'
+        if val_split is None:
+            val_split = self.val_split
+
         if isinstance(val_split, int):
             # kFCV estimation of parameters
             y, y_ = [], []
@@ -374,10 +379,11 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     estimation based on the Hellinger distance.
     Information Sciences, 218:146–164.
     """
 
-    def __init__(self, learner: BaseEstimator):
+    def __init__(self, learner: BaseEstimator, val_split=0.4):
         self.learner = learner
+        self.val_split = val_split
 
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.4):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=None):
         """
         Trains an HDy quantifier
         :param data: the training set
@@ -387,7 +393,9 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         indicating the validation set itself
         :return: self
         """
-        assert val_split is not None, 'val_split cannot be set to None'
+        if val_split is None:
+            val_split = self.val_split
+
         self._check_binary(data, self.__class__.__name__)
         self.learner, validation = training_helper(
             self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
@@ -498,7 +506,7 @@ class OneVsAll(AggregativeQuantifier):
         self.binary_quantifier = binary_quantifier
         self.n_jobs = n_jobs
 
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=None):
+    def fit(self, data: LabelledCollection, fit_learner=True):
         assert not data.binary, \
             f'{self.__class__.__name__} expects non-binary data'
         assert isinstance(self.binary_quantifier, BaseQuantifier), \
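The net effect of the ACC/PACC/HDy changes above, sketched for ACC (the learner and split values are illustrative):

```python
from sklearn.linear_model import LogisticRegression
import quapy as qp
from quapy.method.aggregative import ACC

dataset = qp.datasets.fetch_UCIDataset('german')

# the val_split declared in __init__ becomes the default used by fit
model = ACC(LogisticRegression(), val_split=0.3)
model.fit(dataset.training)               # uses val_split=0.3

# an explicit value passed to fit still takes precedence
model.fit(dataset.training, val_split=5)  # 5-fold kFCV estimation instead
```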
diff --git a/quapy/method/meta.py b/quapy/method/meta.py
index 7c818e4..6251242 100644
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@@ -34,16 +34,26 @@ class Ensemble(BaseQuantifier):
     Information Fusion, 45, 1-15.
     """
 
-    def __init__(self, quantifier: BaseQuantifier, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, verbose=True, max_sample_size=None):
+    def __init__(self,
+                 quantifier: BaseQuantifier,
+                 size=50,
+                 red_size=25,
+                 min_pos=1,
+                 policy='ave',
+                 max_sample_size=None,
+                 val_split=None,
+                 n_jobs=1,
+                 verbose=False):
         assert policy in Ensemble.VALID_POLICIES, \
             f'unknown policy={policy}; valid are {Ensemble.VALID_POLICIES}'
         assert max_sample_size is None or max_sample_size > 0, \
-            'wrong value for max_sample_size; set to a positive number or None'
+            'wrong value for max_sample_size; set it to a positive number or None'
         self.base_quantifier = quantifier
         self.size = size
         self.min_pos = min_pos
         self.red_size = red_size
         self.policy = policy
+        self.val_split = val_split
         self.n_jobs = n_jobs
         self.post_proba_fn = None
         self.verbose = verbose
@@ -53,10 +63,12 @@ class Ensemble(BaseQuantifier):
         if self.verbose:
             print('[Ensemble]' + msg)
 
-    def fit(self, data: qp.data.LabelledCollection, val_split:Union[qp.data.LabelledCollection, float]=None):
+    def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float]=None):
         self.sout('Fit')
         if self.policy=='ds' and not data.binary:
             raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary')
+        if val_split is None:
+            val_split = self.val_split
 
         # randomly chooses the prevalences for each member of the ensemble (preventing classes with less than
         # min_pos positive examples)
@@ -71,7 +83,8 @@ class Ensemble(BaseQuantifier):
         sample_size = len(data) if self.max_sample_size is None else min(self.max_sample_size, len(data))
         self.ensemble = Parallel(n_jobs=self.n_jobs)(
             delayed(_delayed_new_instance)(
-                self.base_quantifier, data, val_split, prev, posteriors, keep_samples=is_static_policy, verbose=self.verbose, sample_size=sample_size
+                self.base_quantifier, data, val_split, prev, posteriors, keep_samples=is_static_policy,
+                verbose=self.verbose, sample_size=sample_size
             ) for prev in tqdm(prevs, desc='fitting ensemble')
         )
@@ -206,15 +219,20 @@ def _delayed_new_instance(base_quantifier,
     if verbose:
         print(f'\tfit-start for prev {F.strprev(prev)}, sample_size={sample_size}')
     model = deepcopy(base_quantifier)
+
+    if val_split is not None:
+        if isinstance(val_split, float):
+            assert 0 < val_split < 1, 'val_split should be in (0,1)'
+            data, val_split = data.split_stratified(train_prop=1-val_split)
+
     sample_index = data.sampling_index(sample_size, *prev)
     sample = data.sampling_from_index(sample_index)
-    if val_split is None:
-        model.fit(sample)
-    else:
-        if isinstance(val_split, float):
-            assert 0 < val_split < 1, 'val_split should be in (0,1)'
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ ... @@ class GridSearchQ(BaseQuantifier):
         assert self.sample_size > 0, 'sample_size must be a positive integer'
         self.__check_num_evals(self.n_prevpoints, self.eval_budget, self.n_repetitions, training.n_classes)
@@ -158,7 +162,7 @@ class GridSearchQ(BaseQuantifier):
         model.fit(training)
         true_prevalences, estim_prevalences = artificial_sampling_prediction(
             model, val_split, self.sample_size, self.n_prevpoints, self.n_repetitions, n_jobs, self.random_seed,
-            verbose=True
+            verbose=False
         )
 
         score = self.error(true_prevalences, estim_prevalences)
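A sketch of the reworked Ensemble interface via EPACC, using the new max_sample_size and val_split arguments (parameter values are illustrative, and passing them through EPACC is assumed to work as in the test script below):

```python
from sklearn.linear_model import LogisticRegression
import quapy as qp
from quapy.method.meta import EPACC

dataset = qp.datasets.fetch_twitter('gasp', min_df=5, pickle=True)

# each of the 10 members is trained on a sample of at most 200 instances;
# the val_split set here is picked up by fit when not overridden there
model = EPACC(LogisticRegression(max_iter=1000),
              size=10, red_size=5, policy='ptr',
              max_sample_size=200, val_split=0.4, n_jobs=-1)
model.fit(dataset.training)
```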
diff --git a/test.py b/test.py
index 9e0eefe..51fc994 100644
--- a/test.py
+++ b/test.py
@@ -13,9 +13,7 @@ from quapy.model_selection import GridSearchQ
 
-#qp.datasets.fetch_UCIDataset('acute.b', verbose=True)
-#sys.exit(0)
-
 qp.environ['SAMPLE_SIZE'] = 500
 
 #param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
 param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']}
@@ -26,11 +24,12 @@ binary = False
 svmperf_home = './svm_perf_quantification'
 
 if binary:
-    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
+    #dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
+    dataset = qp.datasets.fetch_UCIDataset('german', verbose=True)
     #qp.data.preprocessing.index(dataset, inplace=True)
 else:
-    dataset = qp.datasets.fetch_twitter('gasp', for_model_selection=False, min_df=5, pickle=True)
+    dataset = qp.datasets.fetch_twitter('gasp', for_model_selection=True, min_df=5, pickle=True)
     #dataset.training = dataset.training.sampling(sample_size, 0.2, 0.5, 0.3)
 
 print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}')
@@ -57,10 +56,32 @@ print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.tes
 # model = qp.method.aggregative.ClassifyAndCount(learner)
 
 learner = LogisticRegression(max_iter=1000)
-model = qp.method.meta.EPACC(learner, size=10, red_size=5, max_sample_size=200)
-                             # param_grid={'C':[1,10,100]},
-                             # optim='mae', param_mod_sel={'sample_size':100, 'n_prevpoints':21, 'n_repetitions':5},
-                             # policy='ptr', n_jobs=1)
+#model = qp.method.aggregative.PACC(learner)
+#model = qp.method.aggregative.ACC(learner)
+model = qp.method.meta.EPACC(learner, size=10, red_size=5, max_sample_size=500, n_jobs=-1,
+                             param_grid={'C':[1,10,100]},
+                             optim='mae', param_mod_sel={'sample_size':100, 'n_prevpoints':21, 'n_repetitions':5, 'verbose':True},
+                             policy='ptr',
+                             val_split=0.4)
+"""
+Problems:
+- The interface is very ugly; one has to know practically every detail, so nothing is saved with respect to creating
+  an object with others nested inside.
+- fit generates the prevalences at random, which means that, after model selection, a new fit throws away all the
+  work done.
+- The fit of a GridSearchQ holds a best_estimator inside, but after model selection, calling fit again on this object
+  does not simply re-train the model with the best parameters; instead, it launches a new search in grid-search mode.
+- Possible solution (not valid): perform model selection directly on the final benchmark, although this would mean
+  that the hyperparameters are searched on a different set than for the rest of the models...
+- Possible solution:
+    - Choose the prevalences in init
+    -
+- Problem: the val_split parameter is very ambiguous throughout the framework. For example, in EPACC it could be a
+  float that, in the case of a GridSearchQ, could refer to the validation split for the hyperparameters or to the
+  split that PACC uses to estimate its parameters...
+"""
+
 # regressor = LinearSVR(max_iter=10000)
 # param_grid = {'C': np.logspace(-1,3,5)}
 # model = AveragePoolQuantification(regressor, sample_size, trials=5000, n_components=500, zscore=False)
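For reference, the quantification-oriented model selection flow that this patch exercises, as a sketch (the grid values are illustrative, and the error argument assumes GridSearchQ accepts an error function, which its use of self.error above suggests):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
from quapy.method.aggregative import PACC
from quapy.model_selection import GridSearchQ

qp.environ['SAMPLE_SIZE'] = 500

dataset = qp.datasets.fetch_twitter('gasp', for_model_selection=True, min_df=5, pickle=True)

# search the hyperparameter grid by minimizing MAE over artificial samples
model_selection = GridSearchQ(PACC(LogisticRegression(max_iter=1000)),
                              param_grid={'C': np.logspace(0, 3, 4), 'class_weight': ['balanced']},
                              sample_size=500,
                              error=qp.error.mae)
model_selection.fit(dataset.training, dataset.test)
model = model_selection.best_model()
print(model_selection.best_params_)
```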