cleaning and adding some uci datasets

This commit is contained in:
Alejandro Moreo Fernandez 2021-01-11 12:55:06 +01:00
parent d1b449d2e9
commit 41347b50f9
3 changed files with 128 additions and 35 deletions

View File

@ -120,7 +120,10 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
UCI_DATASETS = ['acute.a', 'acute.b',
'balance.1', 'balance.2', 'balance.3']
'balance.1', 'balance.2', 'balance.3',
'breast-cancer',
'cmc.1', 'cmc.2', 'cmc.3',
'ctg.1', 'ctg.2', 'ctg.3'] # ongoing...
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
@ -136,6 +139,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
'balance.1': 'balance-scale',
'balance.2': 'balance-scale',
'balance.3': 'balance-scale',
'breast-cancer': 'breast-cancer-wisconsin',
'cmc.1': 'cmc',
'cmc.2': 'cmc',
'cmc.3': 'cmc',
'ctg.1': 'ctg',
'ctg.2': 'ctg',
'ctg.3': 'ctg',
}
dataset_fullname = {
@ -144,11 +155,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
'balance.1': 'Balance Scale Weight & Distance Database (left)',
'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
'balance.3': 'Balance Scale Weight & Distance Database (right)',
'breast-cancer': 'Breast Cancer Wisconsin (Original)',
'cmc.1': 'Contraceptive Method Choice (no use)',
'cmc.2': 'Contraceptive Method Choice (long term)',
'cmc.3': 'Contraceptive Method Choice (short term)',
'ctg.1': 'Cardiotocography Data Set (normal)',
'ctg.2': 'Cardiotocography Data Set (suspect)',
'ctg.3': 'Cardiotocography Data Set (pathologic)',
}
data_folder = {
'acute': 'diagnosis',
'balance-scale': 'balance-scale',
'breast-cancer-wisconsin': 'breast-cancer-wisconsin',
'cmc': 'cmc'
}
identifier = identifier_map[dataset_name]
@ -183,8 +203,29 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
y = binarize(df[0], pos_class='R')
X = df.loc[:, 1:].astype(float).values
if identifier == 'breast-cancer-wisconsin':
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
Xy = df.loc[:, 1:10]
Xy[Xy=='?']=np.nan
Xy = Xy.dropna(axis=0)
X = Xy.loc[:, 1:9]
X = X.astype(float).values
y = binarize(Xy[10], pos_class=4)
if identifier == 'cmc':
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
X = df.loc[:, 0:8].astype(float).values
y = df[9].astype(int).values
if dataset_name == 'cmc.1':
y = binarize(y, pos_class=1)
elif dataset_name == 'cmc.2':
y = binarize(y, pos_class=2)
elif dataset_name == 'cmc.3':
y = binarize(y, pos_class=3)
data = LabelledCollection(X, y)
data.stats()
raise NotImplementedError()
#print(df)
#print(df.loc[:, 0:5].values)
#print(y)

View File

@ -11,6 +11,8 @@ from sklearn.calibration import CalibratedClassifierCV
from joblib import Parallel, delayed
from abc import abstractmethod
from typing import Union
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
# Abstract classes
@ -115,8 +117,8 @@ def training_helper(learner,
train = data
unused = val_split
else:
raise ValueError('train_val_split not understood; use either a float indicating the split proportion, '
'or a LabelledCollection indicating the validation split')
raise ValueError('param "val_split" not understood; use either a float indicating the split '
'proportion, or a LabelledCollection indicating the validation split')
else:
train, unused = data, None
learner.fit(train.instances, train.labels)
@ -159,23 +161,49 @@ class ACC(AggregativeQuantifier):
def __init__(self, learner:BaseEstimator):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
"""
Trains a ACC quantifier
:param data: the training set
:param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
indicating the validation set itself
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
to estimate the parameters
:return: self
"""
self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split)
if isinstance(val_split, int):
# kFCV estimation of parameters
y, y_ = [], []
kfcv = StratifiedKFold(n_splits=val_split)
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
for k, (training_idx, validation_idx) in enumerate(pbar):
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
training = data.sampling_from_index(training_idx)
validation = data.sampling_from_index(validation_idx)
learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation)
y_.append(learner.predict(val_data.instances))
y.append(val_data.labels)
y = np.concatenate(y)
y_ = np.concatenate(y_)
class_count = data.counts()
# fit the learner on all data
self.learner.fit(*data.Xy)
else:
self.learner, val_data = training_helper(self.learner, data, fit_learner, val_split=val_split)
y_ = self.learner.predict(val_data.instances)
y = val_data.labels
class_count = val_data.counts()
self.cc = CC(self.learner)
y_ = self.classify(validation.instances)
y = validation.labels
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
return self
def classify(self, data):
@ -216,33 +244,53 @@ class PACC(AggregativeProbabilisticQuantifier):
def __init__(self, learner:BaseEstimator):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
"""
Trains a PACC quantifier
:param data: the training set
:param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
indicating the validation set itself
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
to estimate the parameters
:return: self
"""
self.learner, validation = training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split
)
if isinstance(val_split, int):
# kFCV estimation of parameters
y, y_ = [], []
kfcv = StratifiedKFold(n_splits=val_split)
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
for k, (training_idx, validation_idx) in enumerate(pbar):
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
training = data.sampling_from_index(training_idx)
validation = data.sampling_from_index(validation_idx)
learner, val_data = training_helper(
self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation)
y_.append(learner.predict_proba(val_data.instances))
y.append(val_data.labels)
y = np.concatenate(y)
y_ = np.vstack(y_)
# fit the learner on all data
self.learner.fit(*data.Xy)
else:
self.learner, val_data = training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
y_ = self.learner.predict_proba(val_data.instances)
y = val_data.labels
self.pcc = PCC(self.learner)
y_ = self.soft_classify(validation.instances)
y = validation.labels
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
confusion = np.empty(shape=(data.n_classes, data.n_classes))
for yi in range(data.n_classes):
confusion[yi] = y_[y==yi].mean(axis=0)
self.Pte_cond_estim_ = confusion.T
#y_ = self.classify(validation.instances)
#y = validation.labels
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
#self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
return self
def aggregate(self, classif_posteriors):
@ -261,7 +309,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
MAX_ITER = 1000
EPSILON = 1e-4
def __init__(self, learner:BaseEstimator):
def __init__(self, learner: BaseEstimator):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True):
@ -307,10 +355,10 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
estimation based on the Hellinger distance. Information Sciences, 218:146164.
"""
def __init__(self, learner:BaseEstimator):
def __init__(self, learner: BaseEstimator):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.3):
"""
Trains a HDy quantifier
:param data: the training set
@ -404,9 +452,9 @@ ClassifyAndCount = CC
AdjustedClassifyAndCount = ACC
ProbabilisticClassifyAndCount = PCC
ProbabilisticAdjustedClassifyAndCount = PACC
ExplicitLossMinimisation = ELM
ExpectationMaximizationQuantifier = EMQ
HellingerDistanceY = HDy
ExplicitLossMinimisation = ELM
class OneVsAll(AggregativeQuantifier):
@ -436,6 +484,9 @@ class OneVsAll(AggregativeQuantifier):
return self
def classify(self, instances):
# returns a matrix of shape (n,m) with n the number of instances and m the number of classes. The entry
# (i,j) is a binary value indicating whether instance i belongs to class j. The binary classifications are
# independent of each other, meaning that an instance can end up be attributed to 0, 1, or more classes.
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
return classif_predictions_bin.T
@ -475,10 +526,12 @@ class OneVsAll(AggregativeQuantifier):
return self.dict_binary_quantifiers[c].classify(X)
def _delayed_binary_quantify(self, c, X):
return self.dict_binary_quantifiers[c].quantify(X)[1] # the estimation for the positive class prevalence
# the estimation for the positive class prevalence
return self.dict_binary_quantifiers[c].quantify(X)[1]
def _delayed_binary_aggregate(self, c, classif_predictions):
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1] # the estimation for the positive class prevalence
# the estimation for the positive class prevalence
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
def _delayed_binary_fit(self, c, data):
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)

View File

@ -55,15 +55,15 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title
save_or_show(savepath)
def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=21, colormap=cm.tab10,
def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10,
vertical_xticks=False, savepath=None):
from pylab import boxplot, plot, setp
fig, ax = plt.subplots()
ax.grid()
bins = np.linspace(0, 1, nbins)
binwidth = 1/(nbins - 1)
bins = np.linspace(0, 1, nbins+1)
binwidth = 1/nbins
data = {}
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
true_prev = true_prev[:,pos_class]
@ -110,7 +110,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
# set_visible to False for all but the first element) after the legend has been placed
hs=[ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]]
for colorid in range(len(method_names)):
h, = plot([1, 1], '-s', markerfacecolor=colormap.colors[colorid], color='k',
h, = plot([0, 0], '-s', markerfacecolor=colormap.colors[colorid], color='k',
mec=colormap.colors[colorid], linewidth=1.)
hs.append(h)
box = ax.get_position()
@ -126,7 +126,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
save_or_show(savepath)
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, error_name='ae', show_std=True,
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, error_name='ae', show_std=True,
title=f'Quantification error as a function of distribution shift',
savepath=None):
@ -135,7 +135,6 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, e
x_error = qp.error.ae
y_error = getattr(qp.error, error_name)
ndims = tr_prevs[0].shape[-1]
# join all data, and keep the order in which the methods appeared for the first time
data = defaultdict(lambda:{'x':np.empty(shape=(0)), 'y':np.empty(shape=(0))})
@ -152,8 +151,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, e
if method not in method_order:
method_order.append(method)
bins = np.linspace(0, 1, n_bins)
binwidth = 1 / (n_bins - 1)
bins = np.linspace(0, 1, n_bins+1)
binwidth = 1 / n_bins
min_x, max_x = None, None
for method in method_order:
tr_test_drifts = data[method]['x']