From 41347b50f9f9b1292dc661126dc6b04fa64bd0f2 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Mon, 11 Jan 2021 12:55:06 +0100 Subject: [PATCH] cleaning and adding some uci datasets --- quapy/data/datasets.py | 43 ++++++++++++++- quapy/method/aggregative.py | 105 +++++++++++++++++++++++++++--------- quapy/plot.py | 15 +++--- 3 files changed, 128 insertions(+), 35 deletions(-) diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 0ef233e..b919959 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -120,7 +120,10 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom UCI_DATASETS = ['acute.a', 'acute.b', - 'balance.1', 'balance.2', 'balance.3'] + 'balance.1', 'balance.2', 'balance.3', + 'breast-cancer', + 'cmc.1', 'cmc.2', 'cmc.3', + 'ctg.1', 'ctg.2', 'ctg.3'] # ongoing... def fetch_UCIDataset(dataset_name, data_home=None, verbose=False): @@ -136,6 +139,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False): 'balance.1': 'balance-scale', 'balance.2': 'balance-scale', 'balance.3': 'balance-scale', + 'breast-cancer': 'breast-cancer-wisconsin', + 'cmc.1': 'cmc', + 'cmc.2': 'cmc', + 'cmc.3': 'cmc', + 'ctg.1': 'ctg', + 'ctg.2': 'ctg', + 'ctg.3': 'ctg', + } dataset_fullname = { @@ -144,11 +155,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False): 'balance.1': 'Balance Scale Weight & Distance Database (left)', 'balance.2': 'Balance Scale Weight & Distance Database (balanced)', 'balance.3': 'Balance Scale Weight & Distance Database (right)', + 'breast-cancer': 'Breast Cancer Wisconsin (Original)', + 'cmc.1': 'Contraceptive Method Choice (no use)', + 'cmc.2': 'Contraceptive Method Choice (long term)', + 'cmc.3': 'Contraceptive Method Choice (short term)', + 'ctg.1': 'Cardiotocography Data Set (normal)', + 'ctg.2': 'Cardiotocography Data Set (suspect)', + 'ctg.3': 'Cardiotocography Data Set (pathologic)', } data_folder = { 'acute': 'diagnosis', 'balance-scale': 'balance-scale', + 'breast-cancer-wisconsin': 'breast-cancer-wisconsin', + 'cmc': 'cmc' } identifier = identifier_map[dataset_name] @@ -183,8 +203,29 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False): y = binarize(df[0], pos_class='R') X = df.loc[:, 1:].astype(float).values + if identifier == 'breast-cancer-wisconsin': + df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',') + Xy = df.loc[:, 1:10] + Xy[Xy=='?']=np.nan + Xy = Xy.dropna(axis=0) + X = Xy.loc[:, 1:9] + X = X.astype(float).values + y = binarize(Xy[10], pos_class=4) + + if identifier == 'cmc': + df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',') + X = df.loc[:, 0:8].astype(float).values + y = df[9].astype(int).values + if dataset_name == 'cmc.1': + y = binarize(y, pos_class=1) + elif dataset_name == 'cmc.2': + y = binarize(y, pos_class=2) + elif dataset_name == 'cmc.3': + y = binarize(y, pos_class=3) + data = LabelledCollection(X, y) data.stats() + raise NotImplementedError() #print(df) #print(df.loc[:, 0:5].values) #print(y) diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 2613d8e..5a04123 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -11,6 +11,8 @@ from sklearn.calibration import CalibratedClassifierCV from joblib import Parallel, delayed from abc import abstractmethod from typing import Union +from sklearn.model_selection import StratifiedKFold +from tqdm import tqdm # Abstract classes @@ -115,8 +117,8 @@ def training_helper(learner, train = data unused = val_split else: - raise ValueError('train_val_split not understood; use either a float indicating the split proportion, ' - 'or a LabelledCollection indicating the validation split') + raise ValueError('param "val_split" not understood; use either a float indicating the split ' + 'proportion, or a LabelledCollection indicating the validation split') else: train, unused = data, None learner.fit(train.instances, train.labels) @@ -159,23 +161,49 @@ class ACC(AggregativeQuantifier): def __init__(self, learner:BaseEstimator): self.learner = learner - def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3): + def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3): """ Trains a ACC quantifier :param data: the training set :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit) :param val_split: either a float in (0,1) indicating the proportion of training instances to use for validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection - indicating the validation set itself + indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV + to estimate the parameters :return: self """ - self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split) + if isinstance(val_split, int): + # kFCV estimation of parameters + y, y_ = [], [] + kfcv = StratifiedKFold(n_splits=val_split) + pbar = tqdm(kfcv.split(*data.Xy), total=val_split) + for k, (training_idx, validation_idx) in enumerate(pbar): + pbar.set_description(f'{self.__class__.__name__} fitting fold {k}') + training = data.sampling_from_index(training_idx) + validation = data.sampling_from_index(validation_idx) + learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation) + y_.append(learner.predict(val_data.instances)) + y.append(val_data.labels) + + y = np.concatenate(y) + y_ = np.concatenate(y_) + class_count = data.counts() + + # fit the learner on all data + self.learner.fit(*data.Xy) + + else: + self.learner, val_data = training_helper(self.learner, data, fit_learner, val_split=val_split) + y_ = self.learner.predict(val_data.instances) + y = val_data.labels + class_count = val_data.counts() + self.cc = CC(self.learner) - y_ = self.classify(validation.instances) - y = validation.labels + # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a # document that belongs to yj ends up being classified as belonging to yi - self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts() + self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count + return self def classify(self, data): @@ -216,33 +244,53 @@ class PACC(AggregativeProbabilisticQuantifier): def __init__(self, learner:BaseEstimator): self.learner = learner - def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3): + def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3): """ Trains a PACC quantifier :param data: the training set :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit) :param val_split: either a float in (0,1) indicating the proportion of training instances to use for validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection - indicating the validation set itself + indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV + to estimate the parameters :return: self """ - self.learner, validation = training_helper( - self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split - ) + if isinstance(val_split, int): + # kFCV estimation of parameters + y, y_ = [], [] + kfcv = StratifiedKFold(n_splits=val_split) + pbar = tqdm(kfcv.split(*data.Xy), total=val_split) + for k, (training_idx, validation_idx) in enumerate(pbar): + pbar.set_description(f'{self.__class__.__name__} fitting fold {k}') + training = data.sampling_from_index(training_idx) + validation = data.sampling_from_index(validation_idx) + learner, val_data = training_helper( + self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation) + y_.append(learner.predict_proba(val_data.instances)) + y.append(val_data.labels) + + y = np.concatenate(y) + y_ = np.vstack(y_) + + # fit the learner on all data + self.learner.fit(*data.Xy) + + else: + self.learner, val_data = training_helper( + self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split) + y_ = self.learner.predict_proba(val_data.instances) + y = val_data.labels + self.pcc = PCC(self.learner) - y_ = self.soft_classify(validation.instances) - y = validation.labels + + # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a + # document that belongs to yj ends up being classified as belonging to yi confusion = np.empty(shape=(data.n_classes, data.n_classes)) for yi in range(data.n_classes): confusion[yi] = y_[y==yi].mean(axis=0) self.Pte_cond_estim_ = confusion.T - #y_ = self.classify(validation.instances) - #y = validation.labels - # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a - # document that belongs to yj ends up being classified as belonging to yi - #self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts() return self def aggregate(self, classif_posteriors): @@ -261,7 +309,7 @@ class EMQ(AggregativeProbabilisticQuantifier): MAX_ITER = 1000 EPSILON = 1e-4 - def __init__(self, learner:BaseEstimator): + def __init__(self, learner: BaseEstimator): self.learner = learner def fit(self, data: LabelledCollection, fit_learner=True): @@ -307,10 +355,10 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier): estimation based on the Hellinger distance. Information Sciences, 218:146–164. """ - def __init__(self, learner:BaseEstimator): + def __init__(self, learner: BaseEstimator): self.learner = learner - def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3): + def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.3): """ Trains a HDy quantifier :param data: the training set @@ -404,9 +452,9 @@ ClassifyAndCount = CC AdjustedClassifyAndCount = ACC ProbabilisticClassifyAndCount = PCC ProbabilisticAdjustedClassifyAndCount = PACC -ExplicitLossMinimisation = ELM ExpectationMaximizationQuantifier = EMQ HellingerDistanceY = HDy +ExplicitLossMinimisation = ELM class OneVsAll(AggregativeQuantifier): @@ -436,6 +484,9 @@ class OneVsAll(AggregativeQuantifier): return self def classify(self, instances): + # returns a matrix of shape (n,m) with n the number of instances and m the number of classes. The entry + # (i,j) is a binary value indicating whether instance i belongs to class j. The binary classifications are + # independent of each other, meaning that an instance can end up be attributed to 0, 1, or more classes. classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances) return classif_predictions_bin.T @@ -475,10 +526,12 @@ class OneVsAll(AggregativeQuantifier): return self.dict_binary_quantifiers[c].classify(X) def _delayed_binary_quantify(self, c, X): - return self.dict_binary_quantifiers[c].quantify(X)[1] # the estimation for the positive class prevalence + # the estimation for the positive class prevalence + return self.dict_binary_quantifiers[c].quantify(X)[1] def _delayed_binary_aggregate(self, c, classif_predictions): - return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1] # the estimation for the positive class prevalence + # the estimation for the positive class prevalence + return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1] def _delayed_binary_fit(self, c, data): bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2) diff --git a/quapy/plot.py b/quapy/plot.py index 2b9375b..5164a59 100644 --- a/quapy/plot.py +++ b/quapy/plot.py @@ -55,15 +55,15 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title save_or_show(savepath) -def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=21, colormap=cm.tab10, +def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10, vertical_xticks=False, savepath=None): from pylab import boxplot, plot, setp fig, ax = plt.subplots() ax.grid() - bins = np.linspace(0, 1, nbins) - binwidth = 1/(nbins - 1) + bins = np.linspace(0, 1, nbins+1) + binwidth = 1/nbins data = {} for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs): true_prev = true_prev[:,pos_class] @@ -110,7 +110,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N # set_visible to False for all but the first element) after the legend has been placed hs=[ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]] for colorid in range(len(method_names)): - h, = plot([1, 1], '-s', markerfacecolor=colormap.colors[colorid], color='k', + h, = plot([0, 0], '-s', markerfacecolor=colormap.colors[colorid], color='k', mec=colormap.colors[colorid], linewidth=1.) hs.append(h) box = ax.get_position() @@ -126,7 +126,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N save_or_show(savepath) -def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, error_name='ae', show_std=True, +def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, error_name='ae', show_std=True, title=f'Quantification error as a function of distribution shift', savepath=None): @@ -135,7 +135,6 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, e x_error = qp.error.ae y_error = getattr(qp.error, error_name) - ndims = tr_prevs[0].shape[-1] # join all data, and keep the order in which the methods appeared for the first time data = defaultdict(lambda:{'x':np.empty(shape=(0)), 'y':np.empty(shape=(0))}) @@ -152,8 +151,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, e if method not in method_order: method_order.append(method) - bins = np.linspace(0, 1, n_bins) - binwidth = 1 / (n_bins - 1) + bins = np.linspace(0, 1, n_bins+1) + binwidth = 1 / n_bins min_x, max_x = None, None for method in method_order: tr_test_drifts = data[method]['x']