forked from moreo/QuaPy
cleaning and adding some uci datasets
This commit is contained in:
parent
d1b449d2e9
commit
41347b50f9
|
@ -120,7 +120,10 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
|
|||
|
||||
|
||||
UCI_DATASETS = ['acute.a', 'acute.b',
|
||||
'balance.1', 'balance.2', 'balance.3']
|
||||
'balance.1', 'balance.2', 'balance.3',
|
||||
'breast-cancer',
|
||||
'cmc.1', 'cmc.2', 'cmc.3',
|
||||
'ctg.1', 'ctg.2', 'ctg.3'] # ongoing...
|
||||
|
||||
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
||||
|
||||
|
@ -136,6 +139,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
|||
'balance.1': 'balance-scale',
|
||||
'balance.2': 'balance-scale',
|
||||
'balance.3': 'balance-scale',
|
||||
'breast-cancer': 'breast-cancer-wisconsin',
|
||||
'cmc.1': 'cmc',
|
||||
'cmc.2': 'cmc',
|
||||
'cmc.3': 'cmc',
|
||||
'ctg.1': 'ctg',
|
||||
'ctg.2': 'ctg',
|
||||
'ctg.3': 'ctg',
|
||||
|
||||
}
|
||||
|
||||
dataset_fullname = {
|
||||
|
@ -144,11 +155,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
|||
'balance.1': 'Balance Scale Weight & Distance Database (left)',
|
||||
'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
|
||||
'balance.3': 'Balance Scale Weight & Distance Database (right)',
|
||||
'breast-cancer': 'Breast Cancer Wisconsin (Original)',
|
||||
'cmc.1': 'Contraceptive Method Choice (no use)',
|
||||
'cmc.2': 'Contraceptive Method Choice (long term)',
|
||||
'cmc.3': 'Contraceptive Method Choice (short term)',
|
||||
'ctg.1': 'Cardiotocography Data Set (normal)',
|
||||
'ctg.2': 'Cardiotocography Data Set (suspect)',
|
||||
'ctg.3': 'Cardiotocography Data Set (pathologic)',
|
||||
}
|
||||
|
||||
data_folder = {
|
||||
'acute': 'diagnosis',
|
||||
'balance-scale': 'balance-scale',
|
||||
'breast-cancer-wisconsin': 'breast-cancer-wisconsin',
|
||||
'cmc': 'cmc'
|
||||
}
|
||||
|
||||
identifier = identifier_map[dataset_name]
|
||||
|
@ -183,8 +203,29 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
|
|||
y = binarize(df[0], pos_class='R')
|
||||
X = df.loc[:, 1:].astype(float).values
|
||||
|
||||
if identifier == 'breast-cancer-wisconsin':
|
||||
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
|
||||
Xy = df.loc[:, 1:10]
|
||||
Xy[Xy=='?']=np.nan
|
||||
Xy = Xy.dropna(axis=0)
|
||||
X = Xy.loc[:, 1:9]
|
||||
X = X.astype(float).values
|
||||
y = binarize(Xy[10], pos_class=4)
|
||||
|
||||
if identifier == 'cmc':
|
||||
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
|
||||
X = df.loc[:, 0:8].astype(float).values
|
||||
y = df[9].astype(int).values
|
||||
if dataset_name == 'cmc.1':
|
||||
y = binarize(y, pos_class=1)
|
||||
elif dataset_name == 'cmc.2':
|
||||
y = binarize(y, pos_class=2)
|
||||
elif dataset_name == 'cmc.3':
|
||||
y = binarize(y, pos_class=3)
|
||||
|
||||
data = LabelledCollection(X, y)
|
||||
data.stats()
|
||||
raise NotImplementedError()
|
||||
#print(df)
|
||||
#print(df.loc[:, 0:5].values)
|
||||
#print(y)
|
||||
|
|
|
@ -11,6 +11,8 @@ from sklearn.calibration import CalibratedClassifierCV
|
|||
from joblib import Parallel, delayed
|
||||
from abc import abstractmethod
|
||||
from typing import Union
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
# Abstract classes
|
||||
|
@ -115,8 +117,8 @@ def training_helper(learner,
|
|||
train = data
|
||||
unused = val_split
|
||||
else:
|
||||
raise ValueError('train_val_split not understood; use either a float indicating the split proportion, '
|
||||
'or a LabelledCollection indicating the validation split')
|
||||
raise ValueError('param "val_split" not understood; use either a float indicating the split '
|
||||
'proportion, or a LabelledCollection indicating the validation split')
|
||||
else:
|
||||
train, unused = data, None
|
||||
learner.fit(train.instances, train.labels)
|
||||
|
@ -159,23 +161,49 @@ class ACC(AggregativeQuantifier):
|
|||
def __init__(self, learner:BaseEstimator):
|
||||
self.learner = learner
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
|
||||
"""
|
||||
Trains an ACC quantifier
|
||||
:param data: the training set
|
||||
:param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
|
||||
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||
indicating the validation set itself
|
||||
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
|
||||
to estimate the parameters
|
||||
:return: self
|
||||
"""
|
||||
self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split)
|
||||
if isinstance(val_split, int):
|
||||
# kFCV estimation of parameters
|
||||
y, y_ = [], []
|
||||
kfcv = StratifiedKFold(n_splits=val_split)
|
||||
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||
for k, (training_idx, validation_idx) in enumerate(pbar):
|
||||
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
training = data.sampling_from_index(training_idx)
|
||||
validation = data.sampling_from_index(validation_idx)
|
||||
learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation)
|
||||
y_.append(learner.predict(val_data.instances))
|
||||
y.append(val_data.labels)
|
||||
|
||||
y = np.concatenate(y)
|
||||
y_ = np.concatenate(y_)
|
||||
class_count = data.counts()
|
||||
|
||||
# fit the learner on all data
|
||||
self.learner.fit(*data.Xy)
|
||||
|
||||
else:
|
||||
self.learner, val_data = training_helper(self.learner, data, fit_learner, val_split=val_split)
|
||||
y_ = self.learner.predict(val_data.instances)
|
||||
y = val_data.labels
|
||||
class_count = val_data.counts()
|
||||
|
||||
self.cc = CC(self.learner)
|
||||
y_ = self.classify(validation.instances)
|
||||
y = validation.labels
|
||||
|
||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||
# document that belongs to yj ends up being classified as belonging to yi
|
||||
self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
|
||||
self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
|
||||
|
||||
return self
|
||||
|
||||
def classify(self, data):
|
||||
|
@ -216,33 +244,53 @@ class PACC(AggregativeProbabilisticQuantifier):
|
|||
def __init__(self, learner:BaseEstimator):
|
||||
self.learner = learner
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
|
||||
"""
|
||||
Trains a PACC quantifier
|
||||
:param data: the training set
|
||||
:param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
|
||||
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||
indicating the validation set itself
|
||||
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
|
||||
to estimate the parameters
|
||||
:return: self
|
||||
"""
|
||||
self.learner, validation = training_helper(
|
||||
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split
|
||||
)
|
||||
if isinstance(val_split, int):
|
||||
# kFCV estimation of parameters
|
||||
y, y_ = [], []
|
||||
kfcv = StratifiedKFold(n_splits=val_split)
|
||||
pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
|
||||
for k, (training_idx, validation_idx) in enumerate(pbar):
|
||||
pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
|
||||
training = data.sampling_from_index(training_idx)
|
||||
validation = data.sampling_from_index(validation_idx)
|
||||
learner, val_data = training_helper(
|
||||
self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation)
|
||||
y_.append(learner.predict_proba(val_data.instances))
|
||||
y.append(val_data.labels)
|
||||
|
||||
y = np.concatenate(y)
|
||||
y_ = np.vstack(y_)
|
||||
|
||||
# fit the learner on all data
|
||||
self.learner.fit(*data.Xy)
|
||||
|
||||
else:
|
||||
self.learner, val_data = training_helper(
|
||||
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
|
||||
y_ = self.learner.predict_proba(val_data.instances)
|
||||
y = val_data.labels
|
||||
|
||||
self.pcc = PCC(self.learner)
|
||||
y_ = self.soft_classify(validation.instances)
|
||||
y = validation.labels
|
||||
|
||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||
# document that belongs to yj ends up being classified as belonging to yi
|
||||
confusion = np.empty(shape=(data.n_classes, data.n_classes))
|
||||
for yi in range(data.n_classes):
|
||||
confusion[yi] = y_[y==yi].mean(axis=0)
|
||||
|
||||
self.Pte_cond_estim_ = confusion.T
|
||||
|
||||
#y_ = self.classify(validation.instances)
|
||||
#y = validation.labels
|
||||
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||
# document that belongs to yj ends up being classified as belonging to yi
|
||||
#self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
|
||||
return self
|
||||
|
||||
def aggregate(self, classif_posteriors):
|
||||
|
@ -261,7 +309,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
|
|||
MAX_ITER = 1000
|
||||
EPSILON = 1e-4
|
||||
|
||||
def __init__(self, learner:BaseEstimator):
|
||||
def __init__(self, learner: BaseEstimator):
|
||||
self.learner = learner
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True):
|
||||
|
@ -307,10 +355,10 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
|||
estimation based on the Hellinger distance. Information Sciences, 218:146–164.
|
||||
"""
|
||||
|
||||
def __init__(self, learner:BaseEstimator):
|
||||
def __init__(self, learner: BaseEstimator):
|
||||
self.learner = learner
|
||||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
|
||||
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.3):
|
||||
"""
|
||||
Trains an HDy quantifier
|
||||
:param data: the training set
|
||||
|
@ -404,9 +452,9 @@ ClassifyAndCount = CC
|
|||
AdjustedClassifyAndCount = ACC
|
||||
ProbabilisticClassifyAndCount = PCC
|
||||
ProbabilisticAdjustedClassifyAndCount = PACC
|
||||
ExplicitLossMinimisation = ELM
|
||||
ExpectationMaximizationQuantifier = EMQ
|
||||
HellingerDistanceY = HDy
|
||||
ExplicitLossMinimisation = ELM
|
||||
|
||||
|
||||
class OneVsAll(AggregativeQuantifier):
|
||||
|
@ -436,6 +484,9 @@ class OneVsAll(AggregativeQuantifier):
|
|||
return self
|
||||
|
||||
def classify(self, instances):
|
||||
# returns a matrix of shape (n,m) with n the number of instances and m the number of classes. The entry
|
||||
# (i,j) is a binary value indicating whether instance i belongs to class j. The binary classifications are
|
||||
# independent of each other, meaning that an instance can end up being attributed to 0, 1, or more classes.
|
||||
classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
|
||||
return classif_predictions_bin.T
|
||||
|
||||
|
@ -475,10 +526,12 @@ class OneVsAll(AggregativeQuantifier):
|
|||
return self.dict_binary_quantifiers[c].classify(X)
|
||||
|
||||
def _delayed_binary_quantify(self, c, X):
|
||||
return self.dict_binary_quantifiers[c].quantify(X)[1] # the estimation for the positive class prevalence
|
||||
# the estimation for the positive class prevalence
|
||||
return self.dict_binary_quantifiers[c].quantify(X)[1]
|
||||
|
||||
def _delayed_binary_aggregate(self, c, classif_predictions):
|
||||
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1] # the estimation for the positive class prevalence
|
||||
# the estimation for the positive class prevalence
|
||||
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
|
||||
|
||||
def _delayed_binary_fit(self, c, data):
|
||||
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
|
||||
|
|
|
@ -55,15 +55,15 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title
|
|||
save_or_show(savepath)
|
||||
|
||||
|
||||
def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=21, colormap=cm.tab10,
|
||||
def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10,
|
||||
vertical_xticks=False, savepath=None):
|
||||
from pylab import boxplot, plot, setp
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
ax.grid()
|
||||
|
||||
bins = np.linspace(0, 1, nbins)
|
||||
binwidth = 1/(nbins - 1)
|
||||
bins = np.linspace(0, 1, nbins+1)
|
||||
binwidth = 1/nbins
|
||||
data = {}
|
||||
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
|
||||
true_prev = true_prev[:,pos_class]
|
||||
|
@ -110,7 +110,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
|
|||
# set_visible to False for all but the first element) after the legend has been placed
|
||||
hs=[ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]]
|
||||
for colorid in range(len(method_names)):
|
||||
h, = plot([1, 1], '-s', markerfacecolor=colormap.colors[colorid], color='k',
|
||||
h, = plot([0, 0], '-s', markerfacecolor=colormap.colors[colorid], color='k',
|
||||
mec=colormap.colors[colorid], linewidth=1.)
|
||||
hs.append(h)
|
||||
box = ax.get_position()
|
||||
|
@ -126,7 +126,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
|
|||
save_or_show(savepath)
|
||||
|
||||
|
||||
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, error_name='ae', show_std=True,
|
||||
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, error_name='ae', show_std=True,
|
||||
title=f'Quantification error as a function of distribution shift',
|
||||
savepath=None):
|
||||
|
||||
|
@ -135,7 +135,6 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, e
|
|||
|
||||
x_error = qp.error.ae
|
||||
y_error = getattr(qp.error, error_name)
|
||||
ndims = tr_prevs[0].shape[-1]
|
||||
|
||||
# join all data, and keep the order in which the methods appeared for the first time
|
||||
data = defaultdict(lambda:{'x':np.empty(shape=(0)), 'y':np.empty(shape=(0))})
|
||||
|
@ -152,8 +151,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, e
|
|||
if method not in method_order:
|
||||
method_order.append(method)
|
||||
|
||||
bins = np.linspace(0, 1, n_bins)
|
||||
binwidth = 1 / (n_bins - 1)
|
||||
bins = np.linspace(0, 1, n_bins+1)
|
||||
binwidth = 1 / n_bins
|
||||
min_x, max_x = None, None
|
||||
for method in method_order:
|
||||
tr_test_drifts = data[method]['x']
|
||||
|
|
Loading…
Reference in New Issue