all uci datasets from Pérez-Gállego added, quantification report added

This commit is contained in:
Alejandro Moreo Fernandez 2021-01-28 18:22:43 +01:00
parent 1d89301089
commit 3aaf57f2f3
8 changed files with 222 additions and 73 deletions

View File

@ -1,3 +1,4 @@
import numpy as np
import quapy as qp
import settings
import os
@ -11,8 +12,10 @@ qp.environ['SAMPLE_SIZE'] = settings.SAMPLE_SIZE
resultdir = './results'
methods = ['*']
def evaluate_results(methods, datasets, error_name):
results_str = []
all = []
error = qp.error.from_name(error_name)
for method, dataset in itertools.product(methods, datasets):
for experiment in glob(f'{resultdir}/{dataset}-{method}-{error_name}.pkl'):
@ -21,8 +24,12 @@ def evaluate_results(methods, datasets, error_name):
result = error(true_prevalences, estim_prevalences)
string = f'{pathlib.Path(experiment).name}: {result:.3f}'
results_str.append(string)
all.append(result)
results_str = sorted(results_str)
for r in results_str:
print(r)
print()
print(f'Ave: {np.mean(all):.3f}')
evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')
evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')

View File

@ -58,7 +58,7 @@ def quantification_ensembles():
'verbose': False
}
common={
'max_sample_size': 500,
'max_sample_size': 1000,
'n_jobs': settings.ENSEMBLE_N_JOBS,
'param_grid': lr_params,
'param_mod_sel': param_mod_sel,
@ -69,13 +69,13 @@ def quantification_ensembles():
# hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
# will be skipped (by setting hyperparameters to None)
hyper_none = None
yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none
yield 'epaccmaemae', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none
#yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none
yield 'epaccmaemae1k', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none
# yield 'esldmaeptr', EEMQ(newLR(), optim='mae', policy='ptr', **common), hyper_none
# yield 'esldmaemae', EEMQ(newLR(), optim='mae', policy='mae', **common), hyper_none
yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none
yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none
#yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none
#yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none
#yield 'esldmraeptr', EEMQ(newLR(), optim='mrae', policy='ptr', **common), hyper_none
#yield 'esldmraemrae', EEMQ(newLR(), optim='mrae', policy='mrae', **common), hyper_none

View File

@ -1,7 +1,7 @@
import numpy as np
from scipy.sparse import issparse
from scipy.sparse import vstack
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from quapy.functional import artificial_prevalence_sampling, strprev
@ -151,6 +151,12 @@ class LabelledCollection:
f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
return stats_
def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
for train_index, test_index in kf.split(*self.Xy):
train = self.sampling_from_index(train_index)
test = self.sampling_from_index(test_index)
yield train, test
class Dataset:
@ -190,6 +196,11 @@ class Dataset:
f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
@classmethod
def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):
for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)):
yield Dataset(train, test, name=f'fold {(i%nfolds)+1}/{nfolds} (round={(i//nfolds)+1})')
def isbinary(data):
if isinstance(data, Dataset) or isinstance(data, LabelledCollection):

View File

@ -1,7 +1,12 @@
def warn(*args, **kwargs):
pass
import warnings
warnings.warn = warn
import os
import zipfile
from os.path import join
from urllib.error import HTTPError
from sklearn.model_selection import StratifiedKFold
import pandas as pd
@ -17,6 +22,29 @@ TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
'semeval', 'semeval16',
'sst', 'wa', 'wb']
UCI_DATASETS = ['acute.a', 'acute.b',
'balance.1', 'balance.2', 'balance.3',
'breast-cancer',
'cmc.1', 'cmc.2', 'cmc.3',
'ctg.1', 'ctg.2', 'ctg.3',
#'diabetes', # <-- I haven't found this one...
'german',
'haberman',
'ionosphere',
'iris.1', 'iris.2', 'iris.3',
'mammographic',
'pageblocks.5',
#'phoneme', # <-- I haven't found this one...
'semeion',
'sonar',
'spambase',
'spectf',
'tictactoe',
'transfusion',
'wdbc',
'wine.1', 'wine.2', 'wine.3',
'wine-q-red', 'wine-q-white',
'yeast']
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
@ -134,27 +162,12 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
return data
UCI_DATASETS = ['acute.a', 'acute.b',
'balance.1', 'balance.2', 'balance.3',
'breast-cancer',
'cmc.1', 'cmc.2', 'cmc.3',
'ctg.1', 'ctg.2', 'ctg.3',
#'diabetes', # <-- I haven't found this one...
'german',
'haberman',
'ionosphere',
'iris.1', 'iris.2', 'iris.3',
'mammographic',
'pageblocks.5',
#'phoneme', # <-- I haven't found this one...
'semeion',
'sonar',
'spambase',
'spectf',
'tictactoe',
'transfusion'] # ongoing...
def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False):
data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False):
assert dataset_name in UCI_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
@ -188,7 +201,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'spambase': 'Spambase Data Set',
'spectf': 'SPECTF Heart Data',
'tictactoe': 'Tic-Tac-Toe Endgame Database',
'transfusion': 'Blood Transfusion Service Center Data Set '
'transfusion': 'Blood Transfusion Service Center Data Set',
'wdbc': 'Wisconsin Diagnostic Breast Cancer',
'wine.1': 'Wine Recognition Data (1)',
'wine.2': 'Wine Recognition Data (2)',
'wine.3': 'Wine Recognition Data (3)',
'wine-q-red': 'Wine Quality Red (6-10)',
'wine-q-white': 'Wine Quality White (6-10)',
'yeast': 'Yeast',
}
# the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
@ -219,7 +239,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'spambase': 'spambase',
'spectf': 'spect',
'tictactoe': 'tic-tac-toe',
'transfusion': 'blood-transfusion'
'transfusion': 'blood-transfusion',
'wdbc': 'breast-cancer-wisconsin',
'wine-q-red': 'wine-quality',
'wine-q-white': 'wine-quality',
'wine.1': 'wine',
'wine.2': 'wine',
'wine.3': 'wine',
'yeast': 'yeast',
}
# the filename is the name of the file within the data_folder indexed by the identifier
@ -231,7 +258,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'page-blocks': 'page-blocks.data.Z',
'undocumented/connectionist-bench/sonar': 'sonar.all-data',
'spect': ['SPECTF.train', 'SPECTF.test'],
'blood-transfusion': 'transfusion.data'
'blood-transfusion': 'transfusion.data',
'wine-quality': ['winequality-red.csv', 'winequality-white.csv'],
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data'
}
# the filename containing the dataset description (if any)
@ -242,7 +271,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'mammographic-masses': 'mammographic_masses.names',
'undocumented/connectionist-bench/sonar': 'sonar.names',
'spect': 'SPECTF.names',
'blood-transfusion': 'transfusion.names'
'blood-transfusion': 'transfusion.names',
'wine-quality': 'winequality.names',
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names'
}
identifier = identifier_map[dataset_name]
@ -269,16 +300,15 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
print(f'Loading {dataset_name} ({fullname})')
if identifier == 'acute':
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
[df_replace(df, col) for col in range(1, 6)]
X = df.loc[:, 0:5].values
if dataset_name == 'acute.a':
y = binarize(df[6], pos_class='yes')
elif dataset_name == 'acute.b':
y = binarize(df[7], pos_class='yes')
mintemp, maxtemp = 35, 42
df[0] = df[0].apply(lambda x:(float(x.replace(',','.'))-mintemp)/(maxtemp-mintemp)).astype(float, copy=False)
[df_replace(df, col) for col in range(1, 6)]
X = df.loc[:, 0:5].values
if identifier == 'balance-scale':
df = pd.read_csv(data_path, header=None, sep=',')
if dataset_name == 'balance.1':
@ -289,14 +319,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
y = binarize(df[0], pos_class='R')
X = df.loc[:, 1:].astype(float).values
if identifier == 'breast-cancer-wisconsin':
if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer':
df = pd.read_csv(data_path, header=None, sep=',')
Xy = df.loc[:, 1:10]
Xy[Xy=='?']=np.nan
Xy = Xy.dropna(axis=0)
X = Xy.loc[:, 1:9]
X = X.astype(float).values
y = binarize(Xy[10], pos_class=4)
y = binarize(Xy[10], pos_class=2)
if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.loc[:, 2:32].astype(float).values
y = df[1].values
y = binarize(y, pos_class='M')
if identifier == 'cmc':
df = pd.read_csv(data_path, header=None, sep=',')
@ -356,8 +392,8 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
if identifier == 'mammographic-masses':
df = pd.read_csv(data_path, header=None, sep=',')
Xy[df == '?'] = np.nan
Xy = Xy.dropna(axis=0)
df[df == '?'] = np.nan
Xy = df.dropna(axis=0)
X = Xy.iloc[:, 0:5]
X = X.astype(float).values
y = binarize(Xy.iloc[:,5], pos_class=1)
@ -395,9 +431,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
if identifier == 'spect':
dfs = []
for file in filename:
for file in filename:
data_path = join(data_dir, file)
download_file_if_not_exists(f'{URL}/{filename}', data_path)
download_file_if_not_exists(f'{URL}/{file}', data_path)
dfs.append(pd.read_csv(data_path, header=None, sep=','))
df = pd.concat(dfs)
X = df.iloc[:, 1:45].astype(float).values
@ -416,9 +452,34 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
y = df.iloc[:, 4].values
y = binarize(y, pos_class=1)
if identifier == 'wine':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 1:14].astype(float).values
y = df[0].values
if dataset_name == 'wine.1':
y = binarize(y, pos_class=1)
elif dataset_name == 'wine.2':
y = binarize(y, pos_class=2)
elif dataset_name == 'wine.3':
y = binarize(y, pos_class=3)
if identifier == 'wine-quality':
filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
data_path = join(data_dir, filename)
download_file_if_not_exists(f'{URL}/{filename}', data_path)
df = pd.read_csv(data_path, sep=';')
X = df.iloc[:, 0:11].astype(float).values
y = df.iloc[:, 11].values > 5
if identifier == 'yeast':
df = pd.read_csv(data_path, header=None, delim_whitespace=True)
X = df.iloc[:, 1:9].astype(float).values
y = df.iloc[:, 9].values
y = binarize(y, pos_class='NUC')
data = LabelledCollection(X, y)
data.stats()
return Dataset(*data.split_stratified(1-test_split, random_state=0))
return data
def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):

View File

@ -93,4 +93,5 @@ def binarize(y, pos_class):
y = np.asarray(y)
ybin = np.zeros(y.shape, dtype=np.int)
ybin[y == pos_class] = 1
return ybin
return ybin

View File

@ -9,7 +9,7 @@ from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier
from quapy.util import temp_seed
import quapy.functional as F
import pandas as pd
def artificial_sampling_prediction(
model: BaseQuantifier,
@ -62,9 +62,6 @@ def artificial_sampling_prediction(
pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes
results = qp.util.parallel(_predict_prevalences, pbar, n_jobs=n_jobs)
# results = Parallel(n_jobs=n_jobs)(
# delayed(_predict_prevalences)(index) for index in pbar
# )
true_prevalences, estim_prevalences = zip(*results)
true_prevalences = np.asarray(true_prevalences)
@ -73,13 +70,65 @@ def artificial_sampling_prediction(
return true_prevalences, estim_prevalences
def artificial_sampling_report(
model: BaseQuantifier,
test: LabelledCollection,
sample_size,
n_prevpoints=210,
n_repetitions=1,
n_jobs=1,
random_seed=42,
error_metrics:Iterable[Union[str,Callable]]='mae',
verbose=True):
if isinstance(error_metrics, str):
error_metrics=[error_metrics]
error_names = [e if isinstance(e, str) else e.__name__ for e in error_metrics]
error_funcs = [qp.error.from_name(e) if isinstance(e, str) else e for e in error_metrics]
assert all(hasattr(e, '__call__') for e in error_funcs), 'invalid error functions'
df = pd.DataFrame(columns=['true-prev', 'estim-prev']+error_names)
true_prevs, estim_prevs = artificial_sampling_prediction(
model, test, sample_size, n_prevpoints, n_repetitions, n_jobs, random_seed, verbose
)
for true_prev, estim_prev in zip(true_prevs, estim_prevs):
series = {'true-prev': true_prev, 'estim-prev': estim_prev}
for error_name, error_metric in zip(error_names, error_funcs):
score = error_metric(true_prev, estim_prev)
series[error_name] = score
df = df.append(series, ignore_index=True)
return df
def artificial_sampling_eval(
model: BaseQuantifier,
test: LabelledCollection,
sample_size,
n_prevpoints=210,
n_repetitions=1,
n_jobs=1,
random_seed=42,
error_metric:Union[str,Callable]='mae',
verbose=True):
if isinstance(error_metric, str):
error_metric = qp.error.from_name(error_metric)
assert hasattr(error_metric, '__call__'), 'invalid error function'
true_prevs, estim_prevs = artificial_sampling_prediction(
model, test, sample_size, n_prevpoints, n_repetitions, n_jobs, random_seed, verbose
)
return error_metric(true_prevs, estim_prevs)
def evaluate(model: BaseQuantifier, test_samples:Iterable[LabelledCollection], err:Union[str, Callable], n_jobs:int=-1):
if isinstance(err, str):
err = qp.error.from_name(err)
scores = qp.util.parallel(_delayed_eval, ((model, Ti, err) for Ti in test_samples), n_jobs=n_jobs)
# scores = Parallel(n_jobs=n_jobs)(
# delayed(_delayed_eval)(model, Ti, err) for Ti in test_samples
# )
return np.mean(scores)

View File

@ -38,7 +38,7 @@ class Ensemble(BaseQuantifier):
quantifier: BaseQuantifier,
size=50,
red_size=25,
min_pos=1,
min_pos=5,
policy='ave',
max_sample_size=None,
val_split=None,
@ -88,15 +88,8 @@ class Ensemble(BaseQuantifier):
)
self.ensemble = qp.util.parallel(
_delayed_new_instance,
tqdm(args, desc='fitting ensamble', total=self.size),
tqdm(args, desc='fitting ensamble', total=self.size) if self.verbose else args,
n_jobs=self.n_jobs)
# self.ensemble = Parallel(n_jobs=self.n_jobs)(
# delayed(_delayed_new_instance)(
# self.base_quantifier, data, val_split, prev, posteriors, keep_samples=is_static_policy,
# verbose=self.verbose, sample_size=sample_size
# ) for prev in tqdm(prevs, desc='fitting ensamble')
# )
# static selection policy (the name of a quantification-oriented error function to minimize)
if self.policy in qp.error.QUANTIFICATION_ERROR_NAMES:
@ -109,9 +102,6 @@ class Ensemble(BaseQuantifier):
predictions = np.asarray(
qp.util.parallel(_delayed_quantify, ((Qi, instances) for Qi in self.ensemble), n_jobs=self.n_jobs)
)
# predictions = np.asarray(Parallel(n_jobs=self.n_jobs)(
# delayed(_delayed_quantify)(Qi, instances) for Qi in self.ensemble
# ))
if self.policy == 'ptr':
predictions = self.ptr_policy(predictions)
@ -143,7 +133,7 @@ class Ensemble(BaseQuantifier):
scores.append(evaluate(model[0], tests[:i] + tests[i+1:], error, self.n_jobs))
order = np.argsort(scores)
self.ensemble = select_k(self.ensemble, order, k=self.red_size)
self.ensemble = _select_k(self.ensemble, order, k=self.red_size)
def ptr_policy(self, predictions):
"""
@ -154,7 +144,7 @@ class Ensemble(BaseQuantifier):
tr_prevs = [m[1] for m in self.ensemble]
ptr_differences = [qp.error.mse(ptr_i, test_prev_estim) for ptr_i in tr_prevs]
order = np.argsort(ptr_differences)
return select_k(predictions, order, k=self.red_size)
return _select_k(predictions, order, k=self.red_size)
def ds_policy_get_posteriors(self, data: LabelledCollection):
"""
@ -192,7 +182,7 @@ class Ensemble(BaseQuantifier):
tr_distributions = [m[2] for m in self.ensemble]
dist = [F.HellingerDistance(tr_dist_i, test_distribution) for tr_dist_i in tr_distributions]
order = np.argsort(dist)
return select_k(predictions, order, k=self.red_size)
return _select_k(predictions, order, k=self.red_size)
@property
def binary(self):
@ -201,13 +191,10 @@ class Ensemble(BaseQuantifier):
@property
def aggregative(self):
return False
#raise NotImplementedError('aggregative functionality not yet supported for Ensemble')
@property
def probabilistic(self):
return False
#raise NotImplementedError('probabilistic functionality not yet supported for Ensemble')
#return self.base_quantifier.probabilistic
def get_probability_distribution(posterior_probabilities, bins=8):
@ -217,7 +204,7 @@ def get_probability_distribution(posterior_probabilities, bins=8):
return distribution
def select_k(elements, order, k):
def _select_k(elements, order, k):
return [elements[idx] for idx in order[:k]]

39
test.py
View File

@ -8,15 +8,48 @@ import numpy as np
from NewMethods.methods import AveragePoolQuantification
from classification.methods import PCALR
from classification.neural import NeuralClassifierTrainer, CNNnet
from data import Dataset
from method.meta import EPACC
from quapy.model_selection import GridSearchQ
from tqdm import tqdm
import pandas as pd
sample_size=100
qp.environ['SAMPLE_SIZE'] = sample_size
np.random.seed(0)
nfolds=5
nrepeats=1
df = pd.DataFrame(columns=['dataset', 'method', 'mse'])
for datasetname in qp.datasets.UCI_DATASETS[2:]:
collection = qp.datasets.fetch_UCILabelledCollection(datasetname, verbose=False)
scores = []
pbar = tqdm(Dataset.kFCV(collection, nfolds=nfolds, nrepeats=nrepeats), total=nfolds*nrepeats)
for data in pbar:
pbar.set_description(f'{data.name}')
# learner = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid={'C': np.logspace(-3,3,7)}, n_jobs=-1)
learner = LogisticRegression(class_weight='balanced')
# model = qp.method.aggregative.CC(learner)
model = qp.method.meta.EHDy(learner, size=30, red_size=15, verbose=False)
model.fit(data.training)
err = qp.evaluation.artificial_sampling_eval(model, data.test, sample_size, n_prevpoints=101, n_jobs=-1,
error_metric='mse', verbose=False)
scores.append(err)
score = np.mean(scores)
df = df.append({
'dataset': datasetname,
'method': model.__class__.__name__,
'mse': score
}, ignore_index=True)
print(df)
dataset = qp.datasets.fetch_UCIDataset('transfusion', verbose=True)
sys.exit(0)
qp.environ['SAMPLE_SIZE'] = 500
#param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']}
max_evaluations = 500