
cleaning and adding some uci datasets

Alejandro Moreo Fernandez 2021-01-11 12:55:06 +01:00
parent d1b449d2e9
commit 41347b50f9
3 changed files with 128 additions and 35 deletions

quapy/data/datasets.py

@@ -120,7 +120,10 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home
 UCI_DATASETS = ['acute.a', 'acute.b',
-                'balance.1', 'balance.2', 'balance.3']
+                'balance.1', 'balance.2', 'balance.3',
+                'breast-cancer',
+                'cmc.1', 'cmc.2', 'cmc.3',
+                'ctg.1', 'ctg.2', 'ctg.3']  # ongoing...
 def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
@@ -136,6 +139,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
         'balance.1': 'balance-scale',
         'balance.2': 'balance-scale',
         'balance.3': 'balance-scale',
+        'breast-cancer': 'breast-cancer-wisconsin',
+        'cmc.1': 'cmc',
+        'cmc.2': 'cmc',
+        'cmc.3': 'cmc',
+        'ctg.1': 'ctg',
+        'ctg.2': 'ctg',
+        'ctg.3': 'ctg',
     }
     dataset_fullname = {
@@ -144,11 +155,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
         'balance.1': 'Balance Scale Weight & Distance Database (left)',
         'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
         'balance.3': 'Balance Scale Weight & Distance Database (right)',
+        'breast-cancer': 'Breast Cancer Wisconsin (Original)',
+        'cmc.1': 'Contraceptive Method Choice (no use)',
+        'cmc.2': 'Contraceptive Method Choice (long term)',
+        'cmc.3': 'Contraceptive Method Choice (short term)',
+        'ctg.1': 'Cardiotocography Data Set (normal)',
+        'ctg.2': 'Cardiotocography Data Set (suspect)',
+        'ctg.3': 'Cardiotocography Data Set (pathologic)',
     }
     data_folder = {
         'acute': 'diagnosis',
         'balance-scale': 'balance-scale',
+        'breast-cancer-wisconsin': 'breast-cancer-wisconsin',
+        'cmc': 'cmc'
     }
     identifier = identifier_map[dataset_name]
@@ -183,8 +203,29 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
         y = binarize(df[0], pos_class='R')
         X = df.loc[:, 1:].astype(float).values
+    if identifier == 'breast-cancer-wisconsin':
+        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
+        Xy = df.loc[:, 1:10]
+        Xy[Xy=='?']=np.nan
+        Xy = Xy.dropna(axis=0)
+        X = Xy.loc[:, 1:9]
+        X = X.astype(float).values
+        y = binarize(Xy[10], pos_class=4)
+    if identifier == 'cmc':
+        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
+        X = df.loc[:, 0:8].astype(float).values
+        y = df[9].astype(int).values
+        if dataset_name == 'cmc.1':
+            y = binarize(y, pos_class=1)
+        elif dataset_name == 'cmc.2':
+            y = binarize(y, pos_class=2)
+        elif dataset_name == 'cmc.3':
+            y = binarize(y, pos_class=3)
     data = LabelledCollection(X, y)
     data.stats()
+    raise NotImplementedError()
     #print(df)
     #print(df.loc[:, 0:5].values)
     #print(y)
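Review note: the new breast-cancer-wisconsin branch drops rows carrying the UCI missing-value marker '?' before casting the features to float and binarizing the label (class 4, malignant, taken as the positive class). A minimal self-contained sketch of that cleaning step, assuming a local copy of the raw UCI file and a hypothetical binarize helper standing in for quapy's:

import numpy as np
import pandas as pd

def binarize(y, pos_class):
    # hypothetical stand-in for quapy's helper: 1 for pos_class, 0 otherwise
    return (np.asarray(y) == pos_class).astype(int)

df = pd.read_csv('breast-cancer-wisconsin.data', header=None, sep=',')  # assumed local path
Xy = df.loc[:, 1:10].replace('?', np.nan).dropna(axis=0)  # column 0 is a sample id; 1-9 features, 10 label
X = Xy.loc[:, 1:9].astype(float).values
y = binarize(Xy[10].astype(int), pos_class=4)  # 2 = benign, 4 = malignant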

quapy/method/aggregative.py

@@ -11,6 +11,8 @@ from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
 from abc import abstractmethod
 from typing import Union
+from sklearn.model_selection import StratifiedKFold
+from tqdm import tqdm
 # Abstract classes
@@ -115,8 +117,8 @@ def training_helper(learner,
             train = data
             unused = val_split
         else:
-            raise ValueError('train_val_split not understood; use either a float indicating the split proportion, '
-                             'or a LabelledCollection indicating the validation split')
+            raise ValueError('param "val_split" not understood; use either a float indicating the split '
+                             'proportion, or a LabelledCollection indicating the validation split')
     else:
         train, unused = data, None
     learner.fit(train.instances, train.labels)
@@ -159,23 +161,49 @@ class ACC(AggregativeQuantifier):
     def __init__(self, learner:BaseEstimator):
         self.learner = learner
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
         """
         Trains a ACC quantifier
         :param data: the training set
         :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
         :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
          validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
-         indicating the validation set itself
+         indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
+         to estimate the parameters
         :return: self
         """
-        self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split)
+        if isinstance(val_split, int):
+            # kFCV estimation of parameters
+            y, y_ = [], []
+            kfcv = StratifiedKFold(n_splits=val_split)
+            pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
+            for k, (training_idx, validation_idx) in enumerate(pbar):
+                pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
+                training = data.sampling_from_index(training_idx)
+                validation = data.sampling_from_index(validation_idx)
+                learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation)
+                y_.append(learner.predict(val_data.instances))
+                y.append(val_data.labels)
+            y = np.concatenate(y)
+            y_ = np.concatenate(y_)
+            class_count = data.counts()
+            # fit the learner on all data
+            self.learner.fit(*data.Xy)
+        else:
+            self.learner, val_data = training_helper(self.learner, data, fit_learner, val_split=val_split)
+            y_ = self.learner.predict(val_data.instances)
+            y = val_data.labels
+            class_count = val_data.counts()
         self.cc = CC(self.learner)
-        y_ = self.classify(validation.instances)
-        y = validation.labels
         # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
         # document that belongs to yj ends up being classified as belonging to yi
-        self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
+        self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
         return self
     def classify(self, data):
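Review note: the new int branch of ACC.fit is k-fold cross-validation: out-of-fold hard predictions are pooled to estimate the matrix with entries P(predicted=yi | true=yj), and the learner is finally refit on all data. A compact sketch of the same pattern outside quapy, assuming plain sklearn and integer labels 0..n-1:

import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

def kfcv_misclassification_matrix(learner, X, y, k=5):
    true, pred = [], []
    for tr_idx, va_idx in StratifiedKFold(n_splits=k).split(X, y):
        learner.fit(X[tr_idx], y[tr_idx])
        pred.append(learner.predict(X[va_idx]))
        true.append(y[va_idx])
    true, pred = np.concatenate(true), np.concatenate(pred)
    # column j holds the estimated distribution of predictions for true class j,
    # mirroring ACC's Pte_cond_estim_
    M = confusion_matrix(true, pred).T / np.bincount(true)
    learner.fit(X, y)  # refit on all the data, as the diff does
    return M, learner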
@@ -216,33 +244,53 @@ class PACC(AggregativeProbabilisticQuantifier):
     def __init__(self, learner:BaseEstimator):
         self.learner = learner
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
         """
         Trains a PACC quantifier
         :param data: the training set
         :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
         :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
          validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
-         indicating the validation set itself
+         indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
+         to estimate the parameters
         :return: self
         """
-        self.learner, validation = training_helper(
-            self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split
-        )
+        if isinstance(val_split, int):
+            # kFCV estimation of parameters
+            y, y_ = [], []
+            kfcv = StratifiedKFold(n_splits=val_split)
+            pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
+            for k, (training_idx, validation_idx) in enumerate(pbar):
+                pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
+                training = data.sampling_from_index(training_idx)
+                validation = data.sampling_from_index(validation_idx)
+                learner, val_data = training_helper(
+                    self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation)
+                y_.append(learner.predict_proba(val_data.instances))
+                y.append(val_data.labels)
+            y = np.concatenate(y)
+            y_ = np.vstack(y_)
+            # fit the learner on all data
+            self.learner.fit(*data.Xy)
+        else:
+            self.learner, val_data = training_helper(
+                self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
+            y_ = self.learner.predict_proba(val_data.instances)
+            y = val_data.labels
         self.pcc = PCC(self.learner)
-        y_ = self.soft_classify(validation.instances)
-        y = validation.labels
+        # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
+        # document that belongs to yj ends up being classified as belonging to yi
         confusion = np.empty(shape=(data.n_classes, data.n_classes))
         for yi in range(data.n_classes):
             confusion[yi] = y_[y==yi].mean(axis=0)
         self.Pte_cond_estim_ = confusion.T
-        #y_ = self.classify(validation.instances)
-        #y = validation.labels
-        # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
-        # document that belongs to yj ends up being classified as belonging to yi
-        #self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
         return self
     def aggregate(self, classif_posteriors):
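Review note: PACC replaces hard counts with soft ones: each column of the matrix is the mean posterior vector (predict_proba output) over the validation items of one true class. A minimal sketch of that estimator, under the same assumptions as above:

import numpy as np

def soft_misclassification_matrix(posteriors, y_true, n_classes):
    # posteriors: array of shape (n_samples, n_classes) from predict_proba
    confusion = np.empty((n_classes, n_classes))
    for yj in range(n_classes):
        confusion[yj] = posteriors[y_true == yj].mean(axis=0)  # mean posterior of true class yj
    return confusion.T  # entry (i,j) estimates P(classified as yi | true class yj)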
@@ -261,7 +309,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
     MAX_ITER = 1000
     EPSILON = 1e-4
-    def __init__(self, learner:BaseEstimator):
+    def __init__(self, learner: BaseEstimator):
         self.learner = learner
     def fit(self, data: LabelledCollection, fit_learner=True):
@@ -307,10 +355,10 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     estimation based on the Hellinger distance. Information Sciences, 218:146-164.
     """
-    def __init__(self, learner:BaseEstimator):
+    def __init__(self, learner: BaseEstimator):
         self.learner = learner
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.3):
         """
         Trains a HDy quantifier
         :param data: the training set
@@ -404,9 +452,9 @@ ClassifyAndCount = CC
 AdjustedClassifyAndCount = ACC
 ProbabilisticClassifyAndCount = PCC
 ProbabilisticAdjustedClassifyAndCount = PACC
-ExplicitLossMinimisation = ELM
 ExpectationMaximizationQuantifier = EMQ
 HellingerDistanceY = HDy
+ExplicitLossMinimisation = ELM
 class OneVsAll(AggregativeQuantifier):
@@ -436,6 +484,9 @@ class OneVsAll(AggregativeQuantifier):
         return self
     def classify(self, instances):
+        # returns a matrix of shape (n,m) with n the number of instances and m the number of classes. The entry
+        # (i,j) is a binary value indicating whether instance i belongs to class j. The binary classifications are
+        # independent of each other, meaning that an instance can end up being attributed to 0, 1, or more classes.
         classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
         return classif_predictions_bin.T
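Review note: the added comment is worth stressing: the one-vs-all binary decisions are taken independently, so rows of the (n, m) matrix returned by classify need not sum to 1. A toy illustration of that shape contract, with hypothetical hard decisions from three binary quantifiers:

import numpy as np
preds_per_class = [np.array([1, 0, 1, 0]),  # class 0 vs rest
                   np.array([1, 0, 0, 0]),  # class 1 vs rest
                   np.array([0, 0, 1, 1])]  # class 2 vs rest
M = np.vstack(preds_per_class).T            # shape (4, 3), one row per instance
print(M.sum(axis=1))                        # [2 0 2 1]: 0, 1, or more classes per instance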
@@ -475,10 +526,12 @@ class OneVsAll(AggregativeQuantifier):
         return self.dict_binary_quantifiers[c].classify(X)
     def _delayed_binary_quantify(self, c, X):
-        return self.dict_binary_quantifiers[c].quantify(X)[1]  # the estimation for the positive class prevalence
+        # the estimation for the positive class prevalence
+        return self.dict_binary_quantifiers[c].quantify(X)[1]
     def _delayed_binary_aggregate(self, c, classif_predictions):
-        return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1]  # the estimation for the positive class prevalence
+        # the estimation for the positive class prevalence
+        return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
     def _delayed_binary_fit(self, c, data):
         bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)

quapy/plot.py

@@ -55,15 +55,15 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title
     save_or_show(savepath)
-def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=21, colormap=cm.tab10,
+def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10,
                      vertical_xticks=False, savepath=None):
     from pylab import boxplot, plot, setp
     fig, ax = plt.subplots()
     ax.grid()
-    bins = np.linspace(0, 1, nbins)
-    binwidth = 1/(nbins - 1)
+    bins = np.linspace(0, 1, nbins+1)
+    binwidth = 1/nbins
     data = {}
     for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
         true_prev = true_prev[:,pos_class]
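Review note: this fixes an off-by-one: np.linspace(0, 1, nbins) returns nbins points, which define only nbins-1 intervals, so the old code produced one bin fewer than requested, each of width 1/(nbins-1). Requesting nbins+1 edges yields exactly nbins bins of width 1/nbins; the same correction is applied to error_by_drift below. A quick check:

import numpy as np
nbins = 5
edges = np.linspace(0, 1, nbins + 1)  # 6 edges -> 5 bins
assert len(edges) - 1 == nbins
assert np.isclose(edges[1] - edges[0], 1 / nbins)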
@@ -110,7 +110,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None
     # set_visible to False for all but the first element) after the legend has been placed
     hs=[ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]]
     for colorid in range(len(method_names)):
-        h, = plot([1, 1], '-s', markerfacecolor=colormap.colors[colorid], color='k',
+        h, = plot([0, 0], '-s', markerfacecolor=colormap.colors[colorid], color='k',
                   mec=colormap.colors[colorid], linewidth=1.)
         hs.append(h)
     box = ax.get_position()
@@ -126,7 +126,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None
     save_or_show(savepath)
-def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, error_name='ae', show_std=True,
+def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, error_name='ae', show_std=True,
                    title=f'Quantification error as a function of distribution shift',
                    savepath=None):
@@ -135,7 +135,6 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, error_name='ae'
     x_error = qp.error.ae
     y_error = getattr(qp.error, error_name)
-    ndims = tr_prevs[0].shape[-1]
     # join all data, and keep the order in which the methods appeared for the first time
     data = defaultdict(lambda:{'x':np.empty(shape=(0)), 'y':np.empty(shape=(0))})
@@ -152,8 +151,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, error_name='ae'
         if method not in method_order:
             method_order.append(method)
-    bins = np.linspace(0, 1, n_bins)
-    binwidth = 1 / (n_bins - 1)
+    bins = np.linspace(0, 1, n_bins+1)
+    binwidth = 1 / n_bins
     min_x, max_x = None, None
     for method in method_order:
         tr_test_drifts = data[method]['x']