From 3aaf57f2f3981379c114eed691e6b027e1c0cdbc Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Thu, 28 Jan 2021 18:22:43 +0100 Subject: [PATCH] =?UTF-8?q?all=20uci=20datasets=20from=20P=C3=A9rez-G?= =?UTF-8?q?=C3=A1llego=20added,=20quantification=20report=20added?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TweetSentQuant/evaluate_results.py | 9 +- TweetSentQuant/experiments.py | 10 +-- quapy/data/base.py | 13 ++- quapy/data/datasets.py | 133 +++++++++++++++++++++-------- quapy/data/reader.py | 3 +- quapy/evaluation.py | 63 ++++++++++++-- quapy/method/meta.py | 25 ++---- test.py | 39 ++++++++- 8 files changed, 222 insertions(+), 73 deletions(-) diff --git a/TweetSentQuant/evaluate_results.py b/TweetSentQuant/evaluate_results.py index a8aba9d..2b8a4d0 100644 --- a/TweetSentQuant/evaluate_results.py +++ b/TweetSentQuant/evaluate_results.py @@ -1,3 +1,4 @@ +import numpy as np import quapy as qp import settings import os @@ -11,8 +12,10 @@ qp.environ['SAMPLE_SIZE'] = settings.SAMPLE_SIZE resultdir = './results' methods = ['*'] + def evaluate_results(methods, datasets, error_name): results_str = [] + all = [] error = qp.error.from_name(error_name) for method, dataset in itertools.product(methods, datasets): for experiment in glob(f'{resultdir}/{dataset}-{method}-{error_name}.pkl'): @@ -21,8 +24,12 @@ def evaluate_results(methods, datasets, error_name): result = error(true_prevalences, estim_prevalences) string = f'{pathlib.Path(experiment).name}: {result:.3f}' results_str.append(string) + all.append(result) results_str = sorted(results_str) for r in results_str: print(r) + print() + print(f'Ave: {np.mean(all):.3f}') -evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae') \ No newline at end of file + +evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae') diff --git a/TweetSentQuant/experiments.py b/TweetSentQuant/experiments.py index 7e3f0e9..3f3c2d7 100644 --- a/TweetSentQuant/experiments.py +++ b/TweetSentQuant/experiments.py @@ -58,7 +58,7 @@ def quantification_ensembles(): 'verbose': False } common={ - 'max_sample_size': 500, + 'max_sample_size': 1000, 'n_jobs': settings.ENSEMBLE_N_JOBS, 'param_grid': lr_params, 'param_mod_sel': param_mod_sel, @@ -69,13 +69,13 @@ def quantification_ensembles(): # hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection # will be skipped (by setting hyperparameters to None) hyper_none = None - yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none - yield 'epaccmaemae', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none + #yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none + yield 'epaccmaemae1k', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none # yield 'esldmaeptr', EEMQ(newLR(), optim='mae', policy='ptr', **common), hyper_none # yield 'esldmaemae', EEMQ(newLR(), optim='mae', policy='mae', **common), hyper_none - yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none - yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none + #yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none + #yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none #yield 'esldmraeptr', EEMQ(newLR(), optim='mrae', policy='ptr', **common), hyper_none #yield 'esldmraemrae', EEMQ(newLR(), optim='mrae', policy='mrae', **common), hyper_none diff --git a/quapy/data/base.py b/quapy/data/base.py index 0fed0d7..ffa1e33 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -1,7 +1,7 @@ import numpy as np from scipy.sparse import issparse from scipy.sparse import vstack -from sklearn.model_selection import train_test_split +from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold from quapy.functional import artificial_prevalence_sampling, strprev @@ -151,6 +151,12 @@ class LabelledCollection: f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}') return stats_ + def kFCV(self, nfolds=5, nrepeats=1, random_state=0): + kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state) + for train_index, test_index in kf.split(*self.Xy): + train = self.sampling_from_index(train_index) + test = self.sampling_from_index(test_index) + yield train, test class Dataset: @@ -190,6 +196,11 @@ class Dataset: f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, ' f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}') + @classmethod + def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0): + for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)): + yield Dataset(train, test, name=f'fold {(i%nfolds)+1}/{nfolds} (round={(i//nfolds)+1})') + def isbinary(data): if isinstance(data, Dataset) or isinstance(data, LabelledCollection): diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 00c4d7d..15a3921 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -1,7 +1,12 @@ +def warn(*args, **kwargs): + pass +import warnings +warnings.warn = warn import os import zipfile from os.path import join from urllib.error import HTTPError +from sklearn.model_selection import StratifiedKFold import pandas as pd @@ -17,6 +22,29 @@ TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders', TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders', 'semeval', 'semeval16', 'sst', 'wa', 'wb'] +UCI_DATASETS = ['acute.a', 'acute.b', + 'balance.1', 'balance.2', 'balance.3', + 'breast-cancer', + 'cmc.1', 'cmc.2', 'cmc.3', + 'ctg.1', 'ctg.2', 'ctg.3', + #'diabetes', # <-- I haven't found this one... + 'german', + 'haberman', + 'ionosphere', + 'iris.1', 'iris.2', 'iris.3', + 'mammographic', + 'pageblocks.5', + #'phoneme', # <-- I haven't found this one... + 'semeion', + 'sonar', + 'spambase', + 'spectf', + 'tictactoe', + 'transfusion', + 'wdbc', + 'wine.1', 'wine.2', 'wine.3', + 'wine-q-red', 'wine-q-white', + 'yeast'] def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False): @@ -134,27 +162,12 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom return data -UCI_DATASETS = ['acute.a', 'acute.b', - 'balance.1', 'balance.2', 'balance.3', - 'breast-cancer', - 'cmc.1', 'cmc.2', 'cmc.3', - 'ctg.1', 'ctg.2', 'ctg.3', - #'diabetes', # <-- I haven't found this one... - 'german', - 'haberman', - 'ionosphere', - 'iris.1', 'iris.2', 'iris.3', - 'mammographic', - 'pageblocks.5', - #'phoneme', # <-- I haven't found this one... - 'semeion', - 'sonar', - 'spambase', - 'spectf', - 'tictactoe', - 'transfusion'] # ongoing... +def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False): + data = fetch_UCILabelledCollection(dataset_name, data_home, verbose) + return Dataset(*data.split_stratified(1 - test_split, random_state=0)) -def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3): + +def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False): assert dataset_name in UCI_DATASETS, \ f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \ @@ -188,7 +201,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3 'spambase': 'Spambase Data Set', 'spectf': 'SPECTF Heart Data', 'tictactoe': 'Tic-Tac-Toe Endgame Database', - 'transfusion': 'Blood Transfusion Service Center Data Set ' + 'transfusion': 'Blood Transfusion Service Center Data Set', + 'wdbc': 'Wisconsin Diagnostic Breast Cancer', + 'wine.1': 'Wine Recognition Data (1)', + 'wine.2': 'Wine Recognition Data (2)', + 'wine.3': 'Wine Recognition Data (3)', + 'wine-q-red': 'Wine Quality Red (6-10)', + 'wine-q-white': 'Wine Quality White (6-10)', + 'yeast': 'Yeast', } # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use @@ -219,7 +239,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3 'spambase': 'spambase', 'spectf': 'spect', 'tictactoe': 'tic-tac-toe', - 'transfusion': 'blood-transfusion' + 'transfusion': 'blood-transfusion', + 'wdbc': 'breast-cancer-wisconsin', + 'wine-q-red': 'wine-quality', + 'wine-q-white': 'wine-quality', + 'wine.1': 'wine', + 'wine.2': 'wine', + 'wine.3': 'wine', + 'yeast': 'yeast', } # the filename is the name of the file within the data_folder indexed by the identifier @@ -231,7 +258,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3 'page-blocks': 'page-blocks.data.Z', 'undocumented/connectionist-bench/sonar': 'sonar.all-data', 'spect': ['SPECTF.train', 'SPECTF.test'], - 'blood-transfusion': 'transfusion.data' + 'blood-transfusion': 'transfusion.data', + 'wine-quality': ['winequality-red.csv', 'winequality-white.csv'], + 'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data' } # the filename containing the dataset description (if any) @@ -242,7 +271,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3 'mammographic-masses': 'mammographic_masses.names', 'undocumented/connectionist-bench/sonar': 'sonar.names', 'spect': 'SPECTF.names', - 'blood-transfusion': 'transfusion.names' + 'blood-transfusion': 'transfusion.names', + 'wine-quality': 'winequality.names', + 'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names' } identifier = identifier_map[dataset_name] @@ -269,16 +300,15 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3 print(f'Loading {dataset_name} ({fullname})') if identifier == 'acute': df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t') + + df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False) + [df_replace(df, col) for col in range(1, 6)] + X = df.loc[:, 0:5].values if dataset_name == 'acute.a': y = binarize(df[6], pos_class='yes') elif dataset_name == 'acute.b': y = binarize(df[7], pos_class='yes') - mintemp, maxtemp = 35, 42 - df[0] = df[0].apply(lambda x:(float(x.replace(',','.'))-mintemp)/(maxtemp-mintemp)).astype(float, copy=False) - [df_replace(df, col) for col in range(1, 6)] - X = df.loc[:, 0:5].values - if identifier == 'balance-scale': df = pd.read_csv(data_path, header=None, sep=',') if dataset_name == 'balance.1': @@ -289,14 +319,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3 y = binarize(df[0], pos_class='R') X = df.loc[:, 1:].astype(float).values - if identifier == 'breast-cancer-wisconsin': + if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer': df = pd.read_csv(data_path, header=None, sep=',') Xy = df.loc[:, 1:10] Xy[Xy=='?']=np.nan Xy = Xy.dropna(axis=0) X = Xy.loc[:, 1:9] X = X.astype(float).values - y = binarize(Xy[10], pos_class=4) + y = binarize(Xy[10], pos_class=2) + + if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc': + df = pd.read_csv(data_path, header=None, sep=',') + X = df.loc[:, 2:32].astype(float).values + y = df[1].values + y = binarize(y, pos_class='M') if identifier == 'cmc': df = pd.read_csv(data_path, header=None, sep=',') @@ -356,8 +392,8 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3 if identifier == 'mammographic-masses': df = pd.read_csv(data_path, header=None, sep=',') - Xy[df == '?'] = np.nan - Xy = Xy.dropna(axis=0) + df[df == '?'] = np.nan + Xy = df.dropna(axis=0) X = Xy.iloc[:, 0:5] X = X.astype(float).values y = binarize(Xy.iloc[:,5], pos_class=1) @@ -395,9 +431,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3 if identifier == 'spect': dfs = [] - for file in filename: + for file in filename: data_path = join(data_dir, file) - download_file_if_not_exists(f'{URL}/{filename}', data_path) + download_file_if_not_exists(f'{URL}/{file}', data_path) dfs.append(pd.read_csv(data_path, header=None, sep=',')) df = pd.concat(dfs) X = df.iloc[:, 1:45].astype(float).values @@ -416,9 +452,34 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3 y = df.iloc[:, 4].values y = binarize(y, pos_class=1) + if identifier == 'wine': + df = pd.read_csv(data_path, header=None, sep=',') + X = df.iloc[:, 1:14].astype(float).values + y = df[0].values + if dataset_name == 'wine.1': + y = binarize(y, pos_class=1) + elif dataset_name == 'wine.2': + y = binarize(y, pos_class=2) + elif dataset_name == 'wine.3': + y = binarize(y, pos_class=3) + + if identifier == 'wine-quality': + filename = filename[0] if dataset_name=='wine-q-red' else filename[1] + data_path = join(data_dir, filename) + download_file_if_not_exists(f'{URL}/{filename}', data_path) + df = pd.read_csv(data_path, sep=';') + X = df.iloc[:, 0:11].astype(float).values + y = df.iloc[:, 11].values > 5 + + if identifier == 'yeast': + df = pd.read_csv(data_path, header=None, delim_whitespace=True) + X = df.iloc[:, 1:9].astype(float).values + y = df.iloc[:, 9].values + y = binarize(y, pos_class='NUC') + data = LabelledCollection(X, y) data.stats() - return Dataset(*data.split_stratified(1-test_split, random_state=0)) + return data def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float): diff --git a/quapy/data/reader.py b/quapy/data/reader.py index f7e45f4..743b99e 100644 --- a/quapy/data/reader.py +++ b/quapy/data/reader.py @@ -93,4 +93,5 @@ def binarize(y, pos_class): y = np.asarray(y) ybin = np.zeros(y.shape, dtype=np.int) ybin[y == pos_class] = 1 - return ybin \ No newline at end of file + return ybin + diff --git a/quapy/evaluation.py b/quapy/evaluation.py index 02e1c1c..5f5205c 100644 --- a/quapy/evaluation.py +++ b/quapy/evaluation.py @@ -9,7 +9,7 @@ from quapy.data import LabelledCollection from quapy.method.base import BaseQuantifier from quapy.util import temp_seed import quapy.functional as F - +import pandas as pd def artificial_sampling_prediction( model: BaseQuantifier, @@ -62,9 +62,6 @@ def artificial_sampling_prediction( pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes results = qp.util.parallel(_predict_prevalences, pbar, n_jobs=n_jobs) - # results = Parallel(n_jobs=n_jobs)( - # delayed(_predict_prevalences)(index) for index in pbar - # ) true_prevalences, estim_prevalences = zip(*results) true_prevalences = np.asarray(true_prevalences) @@ -73,13 +70,65 @@ def artificial_sampling_prediction( return true_prevalences, estim_prevalences +def artificial_sampling_report( + model: BaseQuantifier, + test: LabelledCollection, + sample_size, + n_prevpoints=210, + n_repetitions=1, + n_jobs=1, + random_seed=42, + error_metrics:Iterable[Union[str,Callable]]='mae', + verbose=True): + + if isinstance(error_metrics, str): + error_metrics=[error_metrics] + + error_names = [e if isinstance(e, str) else e.__name__ for e in error_metrics] + error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics] + assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions' + + df = pd.DataFrame(columns=['true-prev', 'estim-prev']+error_names) + true_prevs, estim_prevs = artificial_sampling_prediction( + model, test, sample_size, n_prevpoints, n_repetitions, n_jobs, random_seed, verbose + ) + for true_prev, estim_prev in zip(true_prevs, estim_prevs): + series = {'true-prev': true_prev, 'estim-prev': estim_prev} + for error_name, error_metric in zip(error_names, error_funcs): + score = error_metric(true_prev, estim_prev) + series[error_name] = score + df = df.append(series, ignore_index=True) + + return df + + +def artificial_sampling_eval( + model: BaseQuantifier, + test: LabelledCollection, + sample_size, + n_prevpoints=210, + n_repetitions=1, + n_jobs=1, + random_seed=42, + error_metric:Union[str,Callable]='mae', + verbose=True): + + if isinstance(error_metric, str): + error_metric = qp.error.from_name(error_metric) + + assert hasattr(error_metric, '__call__'), 'invalid error function' + + true_prevs, estim_prevs = artificial_sampling_prediction( + model, test, sample_size, n_prevpoints, n_repetitions, n_jobs, random_seed, verbose + ) + + return error_metric(true_prevs, estim_prevs) + + def evaluate(model: BaseQuantifier, test_samples:Iterable[LabelledCollection], err:Union[str, Callable], n_jobs:int=-1): if isinstance(err, str): err = qp.error.from_name(err) scores = qp.util.parallel(_delayed_eval, ((model, Ti, err) for Ti in test_samples), n_jobs=n_jobs) - # scores = Parallel(n_jobs=n_jobs)( - # delayed(_delayed_eval)(model, Ti, err) for Ti in test_samples - # ) return np.mean(scores) diff --git a/quapy/method/meta.py b/quapy/method/meta.py index 8849394..5088e39 100644 --- a/quapy/method/meta.py +++ b/quapy/method/meta.py @@ -38,7 +38,7 @@ class Ensemble(BaseQuantifier): quantifier: BaseQuantifier, size=50, red_size=25, - min_pos=1, + min_pos=5, policy='ave', max_sample_size=None, val_split=None, @@ -88,15 +88,8 @@ class Ensemble(BaseQuantifier): ) self.ensemble = qp.util.parallel( _delayed_new_instance, - tqdm(args, desc='fitting ensamble', total=self.size), + tqdm(args, desc='fitting ensamble', total=self.size) if self.verbose else args, n_jobs=self.n_jobs) - # self.ensemble = Parallel(n_jobs=self.n_jobs)( - # delayed(_delayed_new_instance)( - # self.base_quantifier, data, val_split, prev, posteriors, keep_samples=is_static_policy, - # verbose=self.verbose, sample_size=sample_size - # ) for prev in tqdm(prevs, desc='fitting ensamble') - # ) - # static selection policy (the name of a quantification-oriented error function to minimize) if self.policy in qp.error.QUANTIFICATION_ERROR_NAMES: @@ -109,9 +102,6 @@ class Ensemble(BaseQuantifier): predictions = np.asarray( qp.util.parallel(_delayed_quantify, ((Qi, instances) for Qi in self.ensemble), n_jobs=self.n_jobs) ) - # predictions = np.asarray(Parallel(n_jobs=self.n_jobs)( - # delayed(_delayed_quantify)(Qi, instances) for Qi in self.ensemble - # )) if self.policy == 'ptr': predictions = self.ptr_policy(predictions) @@ -143,7 +133,7 @@ class Ensemble(BaseQuantifier): scores.append(evaluate(model[0], tests[:i] + tests[i+1:], error, self.n_jobs)) order = np.argsort(scores) - self.ensemble = select_k(self.ensemble, order, k=self.red_size) + self.ensemble = _select_k(self.ensemble, order, k=self.red_size) def ptr_policy(self, predictions): """ @@ -154,7 +144,7 @@ class Ensemble(BaseQuantifier): tr_prevs = [m[1] for m in self.ensemble] ptr_differences = [qp.error.mse(ptr_i, test_prev_estim) for ptr_i in tr_prevs] order = np.argsort(ptr_differences) - return select_k(predictions, order, k=self.red_size) + return _select_k(predictions, order, k=self.red_size) def ds_policy_get_posteriors(self, data: LabelledCollection): """ @@ -192,7 +182,7 @@ class Ensemble(BaseQuantifier): tr_distributions = [m[2] for m in self.ensemble] dist = [F.HellingerDistance(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions] order = np.argsort(dist) - return select_k(predictions, order, k=self.red_size) + return _select_k(predictions, order, k=self.red_size) @property def binary(self): @@ -201,13 +191,10 @@ class Ensemble(BaseQuantifier): @property def aggregative(self): return False - #raise NotImplementedError('aggregative functionality not yet supported for Ensemble') @property def probabilistic(self): return False - #raise NotImplementedError('probabilistic functionality not yet supported for Ensemble') - #return self.base_quantifier.probabilistic def get_probability_distribution(posterior_probabilities, bins=8): @@ -217,7 +204,7 @@ def get_probability_distribution(posterior_probabilities, bins=8): return distribution -def select_k(elements, order, k): +def _select_k(elements, order, k): return [elements[idx] for idx in order[:k]] diff --git a/test.py b/test.py index b301c54..b7b75f3 100644 --- a/test.py +++ b/test.py @@ -8,15 +8,48 @@ import numpy as np from NewMethods.methods import AveragePoolQuantification from classification.methods import PCALR -from classification.neural import NeuralClassifierTrainer, CNNnet +from data import Dataset from method.meta import EPACC from quapy.model_selection import GridSearchQ +from tqdm import tqdm +import pandas as pd + +sample_size=100 +qp.environ['SAMPLE_SIZE'] = sample_size + +np.random.seed(0) + +nfolds=5 +nrepeats=1 + +df = pd.DataFrame(columns=['dataset', 'method', 'mse']) +for datasetname in qp.datasets.UCI_DATASETS[2:]: + collection = qp.datasets.fetch_UCILabelledCollection(datasetname, verbose=False) + scores = [] + pbar = tqdm(Dataset.kFCV(collection, nfolds=nfolds, nrepeats=nrepeats), total=nfolds*nrepeats) + for data in pbar: + pbar.set_description(f'{data.name}') + # learner = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid={'C': np.logspace(-3,3,7)}, n_jobs=-1) + learner = LogisticRegression(class_weight='balanced') + # model = qp.method.aggregative.CC(learner) + model = qp.method.meta.EHDy(learner, size=30, red_size=15, verbose=False) + model.fit(data.training) + err = qp.evaluation.artificial_sampling_eval(model, data.test, sample_size, n_prevpoints=101, n_jobs=-1, + error_metric='mse', verbose=False) + scores.append(err) + + score = np.mean(scores) + df = df.append({ + 'dataset': datasetname, + 'method': model.__class__.__name__, + 'mse': score + }, ignore_index=True) + print(df) -dataset = qp.datasets.fetch_UCIDataset('transfusion', verbose=True) sys.exit(0) -qp.environ['SAMPLE_SIZE'] = 500 + #param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]} param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']} max_evaluations = 500