all uci datasets from Pérez-Gállego added, quantification report added

This commit is contained in:
Alejandro Moreo Fernandez 2021-01-28 18:22:43 +01:00
parent 1d89301089
commit 3aaf57f2f3
8 changed files with 222 additions and 73 deletions

View File

@ -1,3 +1,4 @@
import numpy as np
import quapy as qp import quapy as qp
import settings import settings
import os import os
@ -11,8 +12,10 @@ qp.environ['SAMPLE_SIZE'] = settings.SAMPLE_SIZE
resultdir = './results' resultdir = './results'
methods = ['*'] methods = ['*']
def evaluate_results(methods, datasets, error_name): def evaluate_results(methods, datasets, error_name):
results_str = [] results_str = []
all = []
error = qp.error.from_name(error_name) error = qp.error.from_name(error_name)
for method, dataset in itertools.product(methods, datasets): for method, dataset in itertools.product(methods, datasets):
for experiment in glob(f'{resultdir}/{dataset}-{method}-{error_name}.pkl'): for experiment in glob(f'{resultdir}/{dataset}-{method}-{error_name}.pkl'):
@ -21,8 +24,12 @@ def evaluate_results(methods, datasets, error_name):
result = error(true_prevalences, estim_prevalences) result = error(true_prevalences, estim_prevalences)
string = f'{pathlib.Path(experiment).name}: {result:.3f}' string = f'{pathlib.Path(experiment).name}: {result:.3f}'
results_str.append(string) results_str.append(string)
all.append(result)
results_str = sorted(results_str) results_str = sorted(results_str)
for r in results_str: for r in results_str:
print(r) print(r)
print()
print(f'Ave: {np.mean(all):.3f}')
evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')
evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')

View File

@ -58,7 +58,7 @@ def quantification_ensembles():
'verbose': False 'verbose': False
} }
common={ common={
'max_sample_size': 500, 'max_sample_size': 1000,
'n_jobs': settings.ENSEMBLE_N_JOBS, 'n_jobs': settings.ENSEMBLE_N_JOBS,
'param_grid': lr_params, 'param_grid': lr_params,
'param_mod_sel': param_mod_sel, 'param_mod_sel': param_mod_sel,
@ -69,13 +69,13 @@ def quantification_ensembles():
# hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection # hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
# will be skipped (by setting hyperparameters to None) # will be skipped (by setting hyperparameters to None)
hyper_none = None hyper_none = None
yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none #yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none
yield 'epaccmaemae', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none yield 'epaccmaemae1k', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none
# yield 'esldmaeptr', EEMQ(newLR(), optim='mae', policy='ptr', **common), hyper_none # yield 'esldmaeptr', EEMQ(newLR(), optim='mae', policy='ptr', **common), hyper_none
# yield 'esldmaemae', EEMQ(newLR(), optim='mae', policy='mae', **common), hyper_none # yield 'esldmaemae', EEMQ(newLR(), optim='mae', policy='mae', **common), hyper_none
yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none #yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none
yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none #yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none
#yield 'esldmraeptr', EEMQ(newLR(), optim='mrae', policy='ptr', **common), hyper_none #yield 'esldmraeptr', EEMQ(newLR(), optim='mrae', policy='ptr', **common), hyper_none
#yield 'esldmraemrae', EEMQ(newLR(), optim='mrae', policy='mrae', **common), hyper_none #yield 'esldmraemrae', EEMQ(newLR(), optim='mrae', policy='mrae', **common), hyper_none

View File

@ -1,7 +1,7 @@
import numpy as np import numpy as np
from scipy.sparse import issparse from scipy.sparse import issparse
from scipy.sparse import vstack from scipy.sparse import vstack
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from quapy.functional import artificial_prevalence_sampling, strprev from quapy.functional import artificial_prevalence_sampling, strprev
@ -151,6 +151,12 @@ class LabelledCollection:
f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}') f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
return stats_ return stats_
def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
    """
    Generate train/test partitions of this collection by repeated stratified k-fold cross-validation.

    :param nfolds: number of folds per repetition (passed as `n_splits`)
    :param nrepeats: number of times the k-fold procedure is repeated (passed as `n_repeats`)
    :param random_state: seed handed to `RepeatedStratifiedKFold`
    :return: a generator of `(train, test)` pairs, each element obtained through
        `self.sampling_from_index` on the indices of the corresponding split
    """
    splitter = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
    # NOTE(review): self.Xy is presumably the (instances, labels) pair expected by split() -- confirm
    for tr_idx, te_idx in splitter.split(*self.Xy):
        training_part = self.sampling_from_index(tr_idx)
        test_part = self.sampling_from_index(te_idx)
        yield training_part, test_part
class Dataset: class Dataset:
@ -190,6 +196,11 @@ class Dataset:
f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, ' f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}') f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
@classmethod
def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):
    """
    Generate one `Dataset` per fold of a (repeated) k-fold cross-validation over `data`.

    :param data: the collection to partition; its own `kFCV` method produces the splits
    :param nfolds: number of folds per repetition
    :param nrepeats: number of repetitions
    :param random_state: seed forwarded to `data.kFCV`
    :return: a generator of `Dataset` objects named 'fold <f>/<nfolds> (round=<r>)', where
        both <f> and <r> are 1-based
    """
    partitions = data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)
    for position, (train, test) in enumerate(partitions):
        # splits arrive round by round, so the running position encodes both indices
        round_idx, fold_idx = divmod(position, nfolds)
        yield Dataset(train, test, name=f'fold {fold_idx+1}/{nfolds} (round={round_idx+1})')
def isbinary(data): def isbinary(data):
if isinstance(data, Dataset) or isinstance(data, LabelledCollection): if isinstance(data, Dataset) or isinstance(data, LabelledCollection):

View File

@ -1,7 +1,12 @@
import warnings


def warn(*args, **kwargs):
    """No-op replacement for `warnings.warn`: accepts any arguments and ignores them."""


# Rebinding the module attribute means every later `warnings.warn(...)` call that goes
# through the `warnings` module hits the no-op above instead of emitting a warning.
warnings.warn = warn
import os import os
import zipfile import zipfile
from os.path import join from os.path import join
from urllib.error import HTTPError from urllib.error import HTTPError
from sklearn.model_selection import StratifiedKFold
import pandas as pd import pandas as pd
@ -17,6 +22,29 @@ TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders', TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
'semeval', 'semeval16', 'semeval', 'semeval16',
'sst', 'wa', 'wb'] 'sst', 'wa', 'wb']
UCI_DATASETS = ['acute.a', 'acute.b',
'balance.1', 'balance.2', 'balance.3',
'breast-cancer',
'cmc.1', 'cmc.2', 'cmc.3',
'ctg.1', 'ctg.2', 'ctg.3',
#'diabetes', # <-- I haven't found this one...
'german',
'haberman',
'ionosphere',
'iris.1', 'iris.2', 'iris.3',
'mammographic',
'pageblocks.5',
#'phoneme', # <-- I haven't found this one...
'semeion',
'sonar',
'spambase',
'spectf',
'tictactoe',
'transfusion',
'wdbc',
'wine.1', 'wine.2', 'wine.3',
'wine-q-red', 'wine-q-white',
'yeast']
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False): def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
@ -134,27 +162,12 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
return data return data
UCI_DATASETS = ['acute.a', 'acute.b', def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False):
'balance.1', 'balance.2', 'balance.3', data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
'breast-cancer', return Dataset(*data.split_stratified(1 - test_split, random_state=0))
'cmc.1', 'cmc.2', 'cmc.3',
'ctg.1', 'ctg.2', 'ctg.3',
#'diabetes', # <-- I haven't found this one...
'german',
'haberman',
'ionosphere',
'iris.1', 'iris.2', 'iris.3',
'mammographic',
'pageblocks.5',
#'phoneme', # <-- I haven't found this one...
'semeion',
'sonar',
'spambase',
'spectf',
'tictactoe',
'transfusion'] # ongoing...
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False):
assert dataset_name in UCI_DATASETS, \ assert dataset_name in UCI_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \ f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
@ -188,7 +201,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'spambase': 'Spambase Data Set', 'spambase': 'Spambase Data Set',
'spectf': 'SPECTF Heart Data', 'spectf': 'SPECTF Heart Data',
'tictactoe': 'Tic-Tac-Toe Endgame Database', 'tictactoe': 'Tic-Tac-Toe Endgame Database',
'transfusion': 'Blood Transfusion Service Center Data Set ' 'transfusion': 'Blood Transfusion Service Center Data Set',
'wdbc': 'Wisconsin Diagnostic Breast Cancer',
'wine.1': 'Wine Recognition Data (1)',
'wine.2': 'Wine Recognition Data (2)',
'wine.3': 'Wine Recognition Data (3)',
'wine-q-red': 'Wine Quality Red (6-10)',
'wine-q-white': 'Wine Quality White (6-10)',
'yeast': 'Yeast',
} }
# the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
@ -219,7 +239,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'spambase': 'spambase', 'spambase': 'spambase',
'spectf': 'spect', 'spectf': 'spect',
'tictactoe': 'tic-tac-toe', 'tictactoe': 'tic-tac-toe',
'transfusion': 'blood-transfusion' 'transfusion': 'blood-transfusion',
'wdbc': 'breast-cancer-wisconsin',
'wine-q-red': 'wine-quality',
'wine-q-white': 'wine-quality',
'wine.1': 'wine',
'wine.2': 'wine',
'wine.3': 'wine',
'yeast': 'yeast',
} }
# the filename is the name of the file within the data_folder indexed by the identifier # the filename is the name of the file within the data_folder indexed by the identifier
@ -231,7 +258,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'page-blocks': 'page-blocks.data.Z', 'page-blocks': 'page-blocks.data.Z',
'undocumented/connectionist-bench/sonar': 'sonar.all-data', 'undocumented/connectionist-bench/sonar': 'sonar.all-data',
'spect': ['SPECTF.train', 'SPECTF.test'], 'spect': ['SPECTF.train', 'SPECTF.test'],
'blood-transfusion': 'transfusion.data' 'blood-transfusion': 'transfusion.data',
'wine-quality': ['winequality-red.csv', 'winequality-white.csv'],
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data'
} }
# the filename containing the dataset description (if any) # the filename containing the dataset description (if any)
@ -242,7 +271,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'mammographic-masses': 'mammographic_masses.names', 'mammographic-masses': 'mammographic_masses.names',
'undocumented/connectionist-bench/sonar': 'sonar.names', 'undocumented/connectionist-bench/sonar': 'sonar.names',
'spect': 'SPECTF.names', 'spect': 'SPECTF.names',
'blood-transfusion': 'transfusion.names' 'blood-transfusion': 'transfusion.names',
'wine-quality': 'winequality.names',
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names'
} }
identifier = identifier_map[dataset_name] identifier = identifier_map[dataset_name]
@ -269,16 +300,15 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
print(f'Loading {dataset_name} ({fullname})') print(f'Loading {dataset_name} ({fullname})')
if identifier == 'acute': if identifier == 'acute':
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t') df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
[df_replace(df, col) for col in range(1, 6)]
X = df.loc[:, 0:5].values
if dataset_name == 'acute.a': if dataset_name == 'acute.a':
y = binarize(df[6], pos_class='yes') y = binarize(df[6], pos_class='yes')
elif dataset_name == 'acute.b': elif dataset_name == 'acute.b':
y = binarize(df[7], pos_class='yes') y = binarize(df[7], pos_class='yes')
mintemp, maxtemp = 35, 42
df[0] = df[0].apply(lambda x:(float(x.replace(',','.'))-mintemp)/(maxtemp-mintemp)).astype(float, copy=False)
[df_replace(df, col) for col in range(1, 6)]
X = df.loc[:, 0:5].values
if identifier == 'balance-scale': if identifier == 'balance-scale':
df = pd.read_csv(data_path, header=None, sep=',') df = pd.read_csv(data_path, header=None, sep=',')
if dataset_name == 'balance.1': if dataset_name == 'balance.1':
@ -289,14 +319,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
y = binarize(df[0], pos_class='R') y = binarize(df[0], pos_class='R')
X = df.loc[:, 1:].astype(float).values X = df.loc[:, 1:].astype(float).values
if identifier == 'breast-cancer-wisconsin': if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer':
df = pd.read_csv(data_path, header=None, sep=',') df = pd.read_csv(data_path, header=None, sep=',')
Xy = df.loc[:, 1:10] Xy = df.loc[:, 1:10]
Xy[Xy=='?']=np.nan Xy[Xy=='?']=np.nan
Xy = Xy.dropna(axis=0) Xy = Xy.dropna(axis=0)
X = Xy.loc[:, 1:9] X = Xy.loc[:, 1:9]
X = X.astype(float).values X = X.astype(float).values
y = binarize(Xy[10], pos_class=4) y = binarize(Xy[10], pos_class=2)
if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.loc[:, 2:32].astype(float).values
y = df[1].values
y = binarize(y, pos_class='M')
if identifier == 'cmc': if identifier == 'cmc':
df = pd.read_csv(data_path, header=None, sep=',') df = pd.read_csv(data_path, header=None, sep=',')
@ -356,8 +392,8 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
if identifier == 'mammographic-masses': if identifier == 'mammographic-masses':
df = pd.read_csv(data_path, header=None, sep=',') df = pd.read_csv(data_path, header=None, sep=',')
Xy[df == '?'] = np.nan df[df == '?'] = np.nan
Xy = Xy.dropna(axis=0) Xy = df.dropna(axis=0)
X = Xy.iloc[:, 0:5] X = Xy.iloc[:, 0:5]
X = X.astype(float).values X = X.astype(float).values
y = binarize(Xy.iloc[:,5], pos_class=1) y = binarize(Xy.iloc[:,5], pos_class=1)
@ -395,9 +431,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
if identifier == 'spect': if identifier == 'spect':
dfs = [] dfs = []
for file in filename: for file in filename:
data_path = join(data_dir, file) data_path = join(data_dir, file)
download_file_if_not_exists(f'{URL}/{filename}', data_path) download_file_if_not_exists(f'{URL}/{file}', data_path)
dfs.append(pd.read_csv(data_path, header=None, sep=',')) dfs.append(pd.read_csv(data_path, header=None, sep=','))
df = pd.concat(dfs) df = pd.concat(dfs)
X = df.iloc[:, 1:45].astype(float).values X = df.iloc[:, 1:45].astype(float).values
@ -416,9 +452,34 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
y = df.iloc[:, 4].values y = df.iloc[:, 4].values
y = binarize(y, pos_class=1) y = binarize(y, pos_class=1)
if identifier == 'wine':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 1:14].astype(float).values
y = df[0].values
if dataset_name == 'wine.1':
y = binarize(y, pos_class=1)
elif dataset_name == 'wine.2':
y = binarize(y, pos_class=2)
elif dataset_name == 'wine.3':
y = binarize(y, pos_class=3)
if identifier == 'wine-quality':
filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
data_path = join(data_dir, filename)
download_file_if_not_exists(f'{URL}/{filename}', data_path)
df = pd.read_csv(data_path, sep=';')
X = df.iloc[:, 0:11].astype(float).values
y = df.iloc[:, 11].values > 5
if identifier == 'yeast':
df = pd.read_csv(data_path, header=None, delim_whitespace=True)
X = df.iloc[:, 1:9].astype(float).values
y = df.iloc[:, 9].values
y = binarize(y, pos_class='NUC')
data = LabelledCollection(X, y) data = LabelledCollection(X, y)
data.stats() data.stats()
return Dataset(*data.split_stratified(1-test_split, random_state=0)) return data
def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float): def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):

View File

def binarize(y, pos_class):
    """
    Turn a label vector into a binary one.

    :param y: array-like of labels
    :param pos_class: the label to be treated as the positive class
    :return: an integer ndarray with the shape of `y`, holding 1 where the label equals
        `pos_class` and 0 elsewhere
    """
    y = np.asarray(y)
    # the builtin `int` is the dtype the removed `np.int` alias used to point at
    ybin = np.zeros(y.shape, dtype=int)
    ybin[y == pos_class] = 1
    return ybin

View File

@ -9,7 +9,7 @@ from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier from quapy.method.base import BaseQuantifier
from quapy.util import temp_seed from quapy.util import temp_seed
import quapy.functional as F import quapy.functional as F
import pandas as pd
def artificial_sampling_prediction( def artificial_sampling_prediction(
model: BaseQuantifier, model: BaseQuantifier,
@ -62,9 +62,6 @@ def artificial_sampling_prediction(
pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes
results = qp.util.parallel(_predict_prevalences, pbar, n_jobs=n_jobs) results = qp.util.parallel(_predict_prevalences, pbar, n_jobs=n_jobs)
# results = Parallel(n_jobs=n_jobs)(
# delayed(_predict_prevalences)(index) for index in pbar
# )
true_prevalences, estim_prevalences = zip(*results) true_prevalences, estim_prevalences = zip(*results)
true_prevalences = np.asarray(true_prevalences) true_prevalences = np.asarray(true_prevalences)
@ -73,13 +70,65 @@ def artificial_sampling_prediction(
return true_prevalences, estim_prevalences return true_prevalences, estim_prevalences
def artificial_sampling_report(
        model: BaseQuantifier,
        test: LabelledCollection,
        sample_size,
        n_prevpoints=210,
        n_repetitions=1,
        n_jobs=1,
        random_seed=42,
        error_metrics:Iterable[Union[str,Callable]]='mae',
        verbose=True):
    """
    Run the artificial sampling protocol and report, for every generated sample, the true
    prevalence, the estimated prevalence, and the value of each requested error metric.

    :param model: the quantifier to evaluate
    :param test: the collection from which samples are drawn
    :param sample_size: forwarded to `artificial_sampling_prediction`
    :param n_prevpoints: forwarded to `artificial_sampling_prediction`
    :param n_repetitions: forwarded to `artificial_sampling_prediction`
    :param n_jobs: forwarded to `artificial_sampling_prediction`
    :param random_seed: forwarded to `artificial_sampling_prediction`
    :param error_metrics: a single error name, or an iterable mixing error names (resolved
        through `qp.error.from_name`) and callables taking `(true_prev, estim_prev)`
    :param verbose: forwarded to `artificial_sampling_prediction`
    :return: a pandas DataFrame with one row per sample and columns 'true-prev',
        'estim-prev', followed by one column per error metric (named after the metric)
    """
    if isinstance(error_metrics, str):
        error_metrics = [error_metrics]
    else:
        # materialize once: the metrics are traversed twice below, which would silently
        # exhaust a generator after the first pass
        error_metrics = list(error_metrics)

    error_names = [e if isinstance(e, str) else e.__name__ for e in error_metrics]
    error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics]
    assert all(callable(e) for e in error_funcs), 'invalid error functions'

    true_prevs, estim_prevs = artificial_sampling_prediction(
        model, test, sample_size, n_prevpoints, n_repetitions, n_jobs, random_seed, verbose
    )

    # Collect plain records and build the frame once: DataFrame.append copied the whole
    # frame on every call and no longer exists in pandas >= 2.0.
    rows = []
    for true_prev, estim_prev in zip(true_prevs, estim_prevs):
        row = {'true-prev': true_prev, 'estim-prev': estim_prev}
        for error_name, error_metric in zip(error_names, error_funcs):
            row[error_name] = error_metric(true_prev, estim_prev)
        rows.append(row)

    return pd.DataFrame(rows, columns=['true-prev', 'estim-prev'] + error_names)
def artificial_sampling_eval(
        model: BaseQuantifier,
        test: LabelledCollection,
        sample_size,
        n_prevpoints=210,
        n_repetitions=1,
        n_jobs=1,
        random_seed=42,
        error_metric:Union[str,Callable]='mae',
        verbose=True):
    """
    Run the artificial sampling protocol and score all predictions with one error metric.

    :param model: the quantifier to evaluate
    :param test: the collection from which samples are drawn
    :param sample_size: forwarded to `artificial_sampling_prediction`
    :param n_prevpoints: forwarded to `artificial_sampling_prediction`
    :param n_repetitions: forwarded to `artificial_sampling_prediction`
    :param n_jobs: forwarded to `artificial_sampling_prediction`
    :param random_seed: forwarded to `artificial_sampling_prediction`
    :param error_metric: an error name (resolved through `qp.error.from_name`) or a callable
        taking `(true_prevs, estim_prevs)`
    :param verbose: forwarded to `artificial_sampling_prediction`
    :return: whatever the error function returns for the full sets of true and estimated
        prevalences
    """
    error_fn = qp.error.from_name(error_metric) if isinstance(error_metric, str) else error_metric
    assert hasattr(error_fn, '__call__'), 'invalid error function'

    true_prevs, estim_prevs = artificial_sampling_prediction(
        model, test, sample_size, n_prevpoints, n_repetitions, n_jobs, random_seed, verbose
    )
    return error_fn(true_prevs, estim_prevs)
def evaluate(model: BaseQuantifier, test_samples:Iterable[LabelledCollection], err:Union[str, Callable], n_jobs:int=-1):
    """
    Average the error of `model` over a series of test samples.

    :param model: the quantifier to evaluate
    :param test_samples: the samples to evaluate on, one task per sample
    :param err: an error name (resolved through `qp.error.from_name`) or an error callable
    :param n_jobs: forwarded to `qp.util.parallel`
    :return: the mean of the per-sample scores produced by `_delayed_eval`
    """
    error_fn = qp.error.from_name(err) if isinstance(err, str) else err
    tasks = ((model, sample, error_fn) for sample in test_samples)
    per_sample_scores = qp.util.parallel(_delayed_eval, tasks, n_jobs=n_jobs)
    return np.mean(per_sample_scores)

View File

@ -38,7 +38,7 @@ class Ensemble(BaseQuantifier):
quantifier: BaseQuantifier, quantifier: BaseQuantifier,
size=50, size=50,
red_size=25, red_size=25,
min_pos=1, min_pos=5,
policy='ave', policy='ave',
max_sample_size=None, max_sample_size=None,
val_split=None, val_split=None,
@ -88,15 +88,8 @@ class Ensemble(BaseQuantifier):
) )
self.ensemble = qp.util.parallel( self.ensemble = qp.util.parallel(
_delayed_new_instance, _delayed_new_instance,
tqdm(args, desc='fitting ensamble', total=self.size), tqdm(args, desc='fitting ensamble', total=self.size) if self.verbose else args,
n_jobs=self.n_jobs) n_jobs=self.n_jobs)
# self.ensemble = Parallel(n_jobs=self.n_jobs)(
# delayed(_delayed_new_instance)(
# self.base_quantifier, data, val_split, prev, posteriors, keep_samples=is_static_policy,
# verbose=self.verbose, sample_size=sample_size
# ) for prev in tqdm(prevs, desc='fitting ensamble')
# )
# static selection policy (the name of a quantification-oriented error function to minimize) # static selection policy (the name of a quantification-oriented error function to minimize)
if self.policy in qp.error.QUANTIFICATION_ERROR_NAMES: if self.policy in qp.error.QUANTIFICATION_ERROR_NAMES:
@ -109,9 +102,6 @@ class Ensemble(BaseQuantifier):
predictions = np.asarray( predictions = np.asarray(
qp.util.parallel(_delayed_quantify, ((Qi, instances) for Qi in self.ensemble), n_jobs=self.n_jobs) qp.util.parallel(_delayed_quantify, ((Qi, instances) for Qi in self.ensemble), n_jobs=self.n_jobs)
) )
# predictions = np.asarray(Parallel(n_jobs=self.n_jobs)(
# delayed(_delayed_quantify)(Qi, instances) for Qi in self.ensemble
# ))
if self.policy == 'ptr': if self.policy == 'ptr':
predictions = self.ptr_policy(predictions) predictions = self.ptr_policy(predictions)
@ -143,7 +133,7 @@ class Ensemble(BaseQuantifier):
scores.append(evaluate(model[0], tests[:i] + tests[i+1:], error, self.n_jobs)) scores.append(evaluate(model[0], tests[:i] + tests[i+1:], error, self.n_jobs))
order = np.argsort(scores) order = np.argsort(scores)
self.ensemble = select_k(self.ensemble, order, k=self.red_size) self.ensemble = _select_k(self.ensemble, order, k=self.red_size)
def ptr_policy(self, predictions): def ptr_policy(self, predictions):
""" """
@ -154,7 +144,7 @@ class Ensemble(BaseQuantifier):
tr_prevs = [m[1] for m in self.ensemble] tr_prevs = [m[1] for m in self.ensemble]
ptr_differences = [qp.error.mse(ptr_i, test_prev_estim) for ptr_i in tr_prevs] ptr_differences = [qp.error.mse(ptr_i, test_prev_estim) for ptr_i in tr_prevs]
order = np.argsort(ptr_differences) order = np.argsort(ptr_differences)
return select_k(predictions, order, k=self.red_size) return _select_k(predictions, order, k=self.red_size)
def ds_policy_get_posteriors(self, data: LabelledCollection): def ds_policy_get_posteriors(self, data: LabelledCollection):
""" """
@ -192,7 +182,7 @@ class Ensemble(BaseQuantifier):
tr_distributions = [m[2] for m in self.ensemble] tr_distributions = [m[2] for m in self.ensemble]
dist = [F.HellingerDistance(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions] dist = [F.HellingerDistance(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions]
order = np.argsort(dist) order = np.argsort(dist)
return select_k(predictions, order, k=self.red_size) return _select_k(predictions, order, k=self.red_size)
@property @property
def binary(self): def binary(self):
@ -201,13 +191,10 @@ class Ensemble(BaseQuantifier):
@property @property
def aggregative(self): def aggregative(self):
return False return False
#raise NotImplementedError('aggregative functionality not yet supported for Ensemble')
@property @property
def probabilistic(self): def probabilistic(self):
return False return False
#raise NotImplementedError('probabilistic functionality not yet supported for Ensemble')
#return self.base_quantifier.probabilistic
def get_probability_distribution(posterior_probabilities, bins=8): def get_probability_distribution(posterior_probabilities, bins=8):
@ -217,7 +204,7 @@ def get_probability_distribution(posterior_probabilities, bins=8):
return distribution return distribution
def _select_k(elements, order, k):
    """Return, as a list, the items of `elements` found at the first `k` positions listed in `order`."""
    leading_positions = order[:k]
    return [elements[position] for position in leading_positions]

39
test.py
View File

@ -8,15 +8,48 @@ import numpy as np
from NewMethods.methods import AveragePoolQuantification from NewMethods.methods import AveragePoolQuantification
from classification.methods import PCALR from classification.methods import PCALR
from classification.neural import NeuralClassifierTrainer, CNNnet from data import Dataset
from method.meta import EPACC from method.meta import EPACC
from quapy.model_selection import GridSearchQ from quapy.model_selection import GridSearchQ
from tqdm import tqdm
import pandas as pd
sample_size=100
qp.environ['SAMPLE_SIZE'] = sample_size
np.random.seed(0)
nfolds=5
nrepeats=1
df = pd.DataFrame(columns=['dataset', 'method', 'mse'])
for datasetname in qp.datasets.UCI_DATASETS[2:]:
collection = qp.datasets.fetch_UCILabelledCollection(datasetname, verbose=False)
scores = []
pbar = tqdm(Dataset.kFCV(collection, nfolds=nfolds, nrepeats=nrepeats), total=nfolds*nrepeats)
for data in pbar:
pbar.set_description(f'{data.name}')
# learner = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid={'C': np.logspace(-3,3,7)}, n_jobs=-1)
learner = LogisticRegression(class_weight='balanced')
# model = qp.method.aggregative.CC(learner)
model = qp.method.meta.EHDy(learner, size=30, red_size=15, verbose=False)
model.fit(data.training)
err = qp.evaluation.artificial_sampling_eval(model, data.test, sample_size, n_prevpoints=101, n_jobs=-1,
error_metric='mse', verbose=False)
scores.append(err)
score = np.mean(scores)
df = df.append({
'dataset': datasetname,
'method': model.__class__.__name__,
'mse': score
}, ignore_index=True)
print(df)
dataset = qp.datasets.fetch_UCIDataset('transfusion', verbose=True)
sys.exit(0) sys.exit(0)
qp.environ['SAMPLE_SIZE'] = 500
#param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]} #param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']} param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']}
max_evaluations = 500 max_evaluations = 500