forked from moreo/QuaPy
cleaning and adding some uci datasets
This commit is contained in:
parent
d1b449d2e9
commit
41347b50f9
|
@ -120,7 +120,10 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
|
||||||
|
|
||||||
|
|
||||||
UCI_DATASETS = ['acute.a', 'acute.b',
|
UCI_DATASETS = ['acute.a', 'acute.b',
|
||||||
'balance.1', 'balance.2', 'balance.3']
|
'balance.1', 'balance.2', 'balance.3',
|
||||||
|
'breast-cancer',
|
||||||
|
'cmc.1', 'cmc.2', 'cmc.3',
|
||||||
|
'ctg.1', 'ctg.2', 'ctg.3'] # ongoing...
|
||||||
|
|
||||||
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
||||||
|
|
||||||
|
@ -136,6 +139,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
||||||
'balance.1': 'balance-scale',
|
'balance.1': 'balance-scale',
|
||||||
'balance.2': 'balance-scale',
|
'balance.2': 'balance-scale',
|
||||||
'balance.3': 'balance-scale',
|
'balance.3': 'balance-scale',
|
||||||
|
'breast-cancer': 'breast-cancer-wisconsin',
|
||||||
|
'cmc.1': 'cmc',
|
||||||
|
'cmc.2': 'cmc',
|
||||||
|
'cmc.3': 'cmc',
|
||||||
|
'ctg.1': 'ctg',
|
||||||
|
'ctg.2': 'ctg',
|
||||||
|
'ctg.3': 'ctg',
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
dataset_fullname = {
|
dataset_fullname = {
|
||||||
|
@ -144,11 +155,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
||||||
'balance.1': 'Balance Scale Weight & Distance Database (left)',
|
'balance.1': 'Balance Scale Weight & Distance Database (left)',
|
||||||
'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
|
'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
|
||||||
'balance.3': 'Balance Scale Weight & Distance Database (right)',
|
'balance.3': 'Balance Scale Weight & Distance Database (right)',
|
||||||
|
'breast-cancer': 'Breast Cancer Wisconsin (Original)',
|
||||||
|
'cmc.1': 'Contraceptive Method Choice (no use)',
|
||||||
|
'cmc.2': 'Contraceptive Method Choice (long term)',
|
||||||
|
'cmc.3': 'Contraceptive Method Choice (short term)',
|
||||||
|
'ctg.1': 'Cardiotocography Data Set (normal)',
|
||||||
|
'ctg.2': 'Cardiotocography Data Set (suspect)',
|
||||||
|
'ctg.3': 'Cardiotocography Data Set (pathologic)',
|
||||||
}
|
}
|
||||||
|
|
||||||
data_folder = {
|
data_folder = {
|
||||||
'acute': 'diagnosis',
|
'acute': 'diagnosis',
|
||||||
'balance-scale': 'balance-scale',
|
'balance-scale': 'balance-scale',
|
||||||
|
'breast-cancer-wisconsin': 'breast-cancer-wisconsin',
|
||||||
|
'cmc': 'cmc'
|
||||||
}
|
}
|
||||||
|
|
||||||
identifier = identifier_map[dataset_name]
|
identifier = identifier_map[dataset_name]
|
||||||
|
@ -183,8 +203,29 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
||||||
y = binarize(df[0], pos_class='R')
|
y = binarize(df[0], pos_class='R')
|
||||||
X = df.loc[:, 1:].astype(float).values
|
X = df.loc[:, 1:].astype(float).values
|
||||||
|
|
||||||
|
if identifier == 'breast-cancer-wisconsin':
|
||||||
|
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
|
||||||
|
Xy = df.loc[:, 1:10]
|
||||||
|
Xy[Xy=='?']=np.nan
|
||||||
|
Xy = Xy.dropna(axis=0)
|
||||||
|
X = Xy.loc[:, 1:9]
|
||||||
|
X = X.astype(float).values
|
||||||
|
y = binarize(Xy[10], pos_class=4)
|
||||||
|
|
||||||
|
if identifier == 'cmc':
|
||||||
|
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
|
||||||
|
X = df.loc[:, 0:8].astype(float).values
|
||||||
|
y = df[9].astype(int).values
|
||||||
|
if dataset_name == 'cmc.1':
|
||||||
|
y = binarize(y, pos_class=1)
|
||||||
|
elif dataset_name == 'cmc.2':
|
||||||
|
y = binarize(y, pos_class=2)
|
||||||
|
elif dataset_name == 'cmc.3':
|
||||||
|
y = binarize(y, pos_class=3)
|
||||||
|
|
||||||
data = LabelledCollection(X, y)
|
data = LabelledCollection(X, y)
|
||||||
data.stats()
|
data.stats()
|
||||||
|
raise NotImplementedError()
|
||||||
#print(df)
|
#print(df)
|
||||||
#print(df.loc[:, 0:5].values)
|
#print(df.loc[:, 0:5].values)
|
||||||
#print(y)
|
#print(y)
|
||||||
|
|
|
@ -11,6 +11,8 @@ from sklearn.calibration import CalibratedClassifierCV
|
||||||
from joblib import Parallel, delayed
|
from joblib import Parallel, delayed
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
from sklearn.model_selection import StratifiedKFold
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
# Abstract classes
|
# Abstract classes
|
||||||
|
@ -115,8 +117,8 @@ def training_helper(learner,
|
||||||
train = data
|
train = data
|
||||||
unused = val_split
|
unused = val_split
|
||||||
else:
|
else:
|
||||||
raise ValueError('train_val_split not understood; use either a float indicating the split proportion, '
|
raise ValueError('param "val_split" not understood; use either a float indicating the split '
|
||||||
'or a LabelledCollection indicating the validation split')
|
'proportion, or a LabelledCollection indicating the validation split')
|
||||||
else:
|
else:
|
||||||
train, unused = data, None
|
train, unused = data, None
|
||||||
learner.fit(train.instances, train.labels)
|
learner.fit(train.instances, train.labels)
|
||||||
|
@ -159,23 +161,49 @@ class ACC(AggregativeQuantifier):
|
||||||
def __init__(self, learner:BaseEstimator):
|
def __init__(self, learner:BaseEstimator):
|
||||||
self.learner = learner
|
self.learner = learner
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
|
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
|
||||||
"""
|
"""
|
||||||
Trains a ACC quantifier
|
Trains a ACC quantifier
|
||||||
:param data: the training set
|
:param data: the training set
|
||||||
:param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
|
:param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
indicating the validation set itself
|
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
|
||||||
|
to estimate the parameters
|
||||||
:return: self
|
:return: self
|
||||||
"""
|
"""
|
||||||
self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split)
|
if isinstance(val_split, int):
|
||||||
|
# kFCV estimation of parameters
|
||||||
|
y, y_ = [], []
|
||||||
|
kfcv = StratifiedKFold(n_splits=val_split)
|
||||||
|
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||||
|
for k, (training_idx, validation_idx) in enumerate(pbar):
|
||||||
|
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||||
|
training = data.sampling_from_index(training_idx)
|
||||||
|
validation = data.sampling_from_index(validation_idx)
|
||||||
|
learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation)
|
||||||
|
y_.append(learner.predict(val_data.instances))
|
||||||
|
y.append(val_data.labels)
|
||||||
|
|
||||||
|
y = np.concatenate(y)
|
||||||
|
y_ = np.concatenate(y_)
|
||||||
|
class_count = data.counts()
|
||||||
|
|
||||||
|
# fit the learner on all data
|
||||||
|
self.learner.fit(*data.Xy)
|
||||||
|
|
||||||
|
else:
|
||||||
|
self.learner, val_data = training_helper(self.learner, data, fit_learner, val_split=val_split)
|
||||||
|
y_ = self.learner.predict(val_data.instances)
|
||||||
|
y = val_data.labels
|
||||||
|
class_count = val_data.counts()
|
||||||
|
|
||||||
self.cc = CC(self.learner)
|
self.cc = CC(self.learner)
|
||||||
y_ = self.classify(validation.instances)
|
|
||||||
y = validation.labels
|
|
||||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||||
# document that belongs to yj ends up being classified as belonging to yi
|
# document that belongs to yj ends up being classified as belonging to yi
|
||||||
self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
|
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def classify(self, data):
|
def classify(self, data):
|
||||||
|
@ -216,33 +244,53 @@ class PACC(AggregativeProbabilisticQuantifier):
|
||||||
def __init__(self, learner:BaseEstimator):
|
def __init__(self, learner:BaseEstimator):
|
||||||
self.learner = learner
|
self.learner = learner
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
|
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
|
||||||
"""
|
"""
|
||||||
Trains a PACC quantifier
|
Trains a PACC quantifier
|
||||||
:param data: the training set
|
:param data: the training set
|
||||||
:param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
|
:param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
indicating the validation set itself
|
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
|
||||||
|
to estimate the parameters
|
||||||
:return: self
|
:return: self
|
||||||
"""
|
"""
|
||||||
self.learner, validation = training_helper(
|
if isinstance(val_split, int):
|
||||||
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split
|
# kFCV estimation of parameters
|
||||||
)
|
y, y_ = [], []
|
||||||
|
kfcv = StratifiedKFold(n_splits=val_split)
|
||||||
|
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||||
|
for k, (training_idx, validation_idx) in enumerate(pbar):
|
||||||
|
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||||
|
training = data.sampling_from_index(training_idx)
|
||||||
|
validation = data.sampling_from_index(validation_idx)
|
||||||
|
learner, val_data = training_helper(
|
||||||
|
self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation)
|
||||||
|
y_.append(learner.predict_proba(val_data.instances))
|
||||||
|
y.append(val_data.labels)
|
||||||
|
|
||||||
|
y = np.concatenate(y)
|
||||||
|
y_ = np.vstack(y_)
|
||||||
|
|
||||||
|
# fit the learner on all data
|
||||||
|
self.learner.fit(*data.Xy)
|
||||||
|
|
||||||
|
else:
|
||||||
|
self.learner, val_data = training_helper(
|
||||||
|
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
|
||||||
|
y_ = self.learner.predict_proba(val_data.instances)
|
||||||
|
y = val_data.labels
|
||||||
|
|
||||||
self.pcc = PCC(self.learner)
|
self.pcc = PCC(self.learner)
|
||||||
y_ = self.soft_classify(validation.instances)
|
|
||||||
y = validation.labels
|
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||||
|
# document that belongs to yj ends up being classified as belonging to yi
|
||||||
confusion = np.empty(shape=(data.n_classes, data.n_classes))
|
confusion = np.empty(shape=(data.n_classes, data.n_classes))
|
||||||
for yi in range(data.n_classes):
|
for yi in range(data.n_classes):
|
||||||
confusion[yi] = y_[y==yi].mean(axis=0)
|
confusion[yi] = y_[y==yi].mean(axis=0)
|
||||||
|
|
||||||
self.Pte_cond_estim_ = confusion.T
|
self.Pte_cond_estim_ = confusion.T
|
||||||
|
|
||||||
#y_ = self.classify(validation.instances)
|
|
||||||
#y = validation.labels
|
|
||||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
|
||||||
# document that belongs to yj ends up being classified as belonging to yi
|
|
||||||
#self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def aggregate(self, classif_posteriors):
|
def aggregate(self, classif_posteriors):
|
||||||
|
@ -404,9 +452,9 @@ ClassifyAndCount = CC
|
||||||
AdjustedClassifyAndCount = ACC
|
AdjustedClassifyAndCount = ACC
|
||||||
ProbabilisticClassifyAndCount = PCC
|
ProbabilisticClassifyAndCount = PCC
|
||||||
ProbabilisticAdjustedClassifyAndCount = PACC
|
ProbabilisticAdjustedClassifyAndCount = PACC
|
||||||
ExplicitLossMinimisation = ELM
|
|
||||||
ExpectationMaximizationQuantifier = EMQ
|
ExpectationMaximizationQuantifier = EMQ
|
||||||
HellingerDistanceY = HDy
|
HellingerDistanceY = HDy
|
||||||
|
ExplicitLossMinimisation = ELM
|
||||||
|
|
||||||
|
|
||||||
class OneVsAll(AggregativeQuantifier):
|
class OneVsAll(AggregativeQuantifier):
|
||||||
|
@ -436,6 +484,9 @@ class OneVsAll(AggregativeQuantifier):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def classify(self, instances):
|
def classify(self, instances):
|
||||||
|
# returns a matrix of shape (n,m) with n the number of instances and m the number of classes. The entry
|
||||||
|
# (i,j) is a binary value indicating whether instance i belongs to class j. The binary classifications are
|
||||||
|
# independent of each other, meaning that an instance can end up being attributed to 0, 1, or more classes.
|
||||||
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
|
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
|
||||||
return classif_predictions_bin.T
|
return classif_predictions_bin.T
|
||||||
|
|
||||||
|
@ -475,10 +526,12 @@ class OneVsAll(AggregativeQuantifier):
|
||||||
return self.dict_binary_quantifiers[c].classify(X)
|
return self.dict_binary_quantifiers[c].classify(X)
|
||||||
|
|
||||||
def _delayed_binary_quantify(self, c, X):
|
def _delayed_binary_quantify(self, c, X):
|
||||||
return self.dict_binary_quantifiers[c].quantify(X)[1] # the estimation for the positive class prevalence
|
# the estimation for the positive class prevalence
|
||||||
|
return self.dict_binary_quantifiers[c].quantify(X)[1]
|
||||||
|
|
||||||
def _delayed_binary_aggregate(self, c, classif_predictions):
|
def _delayed_binary_aggregate(self, c, classif_predictions):
|
||||||
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1] # the estimation for the positive class prevalence
|
# the estimation for the positive class prevalence
|
||||||
|
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
|
||||||
|
|
||||||
def _delayed_binary_fit(self, c, data):
|
def _delayed_binary_fit(self, c, data):
|
||||||
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
|
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
|
||||||
|
|
|
@ -55,15 +55,15 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title
|
||||||
save_or_show(savepath)
|
save_or_show(savepath)
|
||||||
|
|
||||||
|
|
||||||
def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=21, colormap=cm.tab10,
|
def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10,
|
||||||
vertical_xticks=False, savepath=None):
|
vertical_xticks=False, savepath=None):
|
||||||
from pylab import boxplot, plot, setp
|
from pylab import boxplot, plot, setp
|
||||||
|
|
||||||
fig, ax = plt.subplots()
|
fig, ax = plt.subplots()
|
||||||
ax.grid()
|
ax.grid()
|
||||||
|
|
||||||
bins = np.linspace(0, 1, nbins)
|
bins = np.linspace(0, 1, nbins+1)
|
||||||
binwidth = 1/(nbins - 1)
|
binwidth = 1/nbins
|
||||||
data = {}
|
data = {}
|
||||||
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
|
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
|
||||||
true_prev = true_prev[:,pos_class]
|
true_prev = true_prev[:,pos_class]
|
||||||
|
@ -110,7 +110,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
|
||||||
# set_visible to False for all but the first element) after the legend has been placed
|
# set_visible to False for all but the first element) after the legend has been placed
|
||||||
hs=[ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]]
|
hs=[ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]]
|
||||||
for colorid in range(len(method_names)):
|
for colorid in range(len(method_names)):
|
||||||
h, = plot([1, 1], '-s', markerfacecolor=colormap.colors[colorid], color='k',
|
h, = plot([0, 0], '-s', markerfacecolor=colormap.colors[colorid], color='k',
|
||||||
mec=colormap.colors[colorid], linewidth=1.)
|
mec=colormap.colors[colorid], linewidth=1.)
|
||||||
hs.append(h)
|
hs.append(h)
|
||||||
box = ax.get_position()
|
box = ax.get_position()
|
||||||
|
@ -126,7 +126,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
|
||||||
save_or_show(savepath)
|
save_or_show(savepath)
|
||||||
|
|
||||||
|
|
||||||
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, error_name='ae', show_std=True,
|
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, error_name='ae', show_std=True,
|
||||||
title=f'Quantification error as a function of distribution shift',
|
title=f'Quantification error as a function of distribution shift',
|
||||||
savepath=None):
|
savepath=None):
|
||||||
|
|
||||||
|
@ -135,7 +135,6 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, e
|
||||||
|
|
||||||
x_error = qp.error.ae
|
x_error = qp.error.ae
|
||||||
y_error = getattr(qp.error, error_name)
|
y_error = getattr(qp.error, error_name)
|
||||||
ndims = tr_prevs[0].shape[-1]
|
|
||||||
|
|
||||||
# join all data, and keep the order in which the methods appeared for the first time
|
# join all data, and keep the order in which the methods appeared for the first time
|
||||||
data = defaultdict(lambda:{'x':np.empty(shape=(0)), 'y':np.empty(shape=(0))})
|
data = defaultdict(lambda:{'x':np.empty(shape=(0)), 'y':np.empty(shape=(0))})
|
||||||
|
@ -152,8 +151,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, e
|
||||||
if method not in method_order:
|
if method not in method_order:
|
||||||
method_order.append(method)
|
method_order.append(method)
|
||||||
|
|
||||||
bins = np.linspace(0, 1, n_bins)
|
bins = np.linspace(0, 1, n_bins+1)
|
||||||
binwidth = 1 / (n_bins - 1)
|
binwidth = 1 / n_bins
|
||||||
min_x, max_x = None, None
|
min_x, max_x = None, None
|
||||||
for method in method_order:
|
for method in method_order:
|
||||||
tr_test_drifts = data[method]['x']
|
tr_test_drifts = data[method]['x']
|
||||||
|
|
Loading…
Reference in New Issue