forked from moreo/QuaPy
cleaning branch
This commit is contained in:
parent b3ccf71edb
commit caa7fd2884
@@ -0,0 +1,315 @@
import numpy as np
import scipy.special
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC

import quapy as qp
from quapy.model_selection import GridSearchQ
from quapy.protocol import APP
from quapy.method.aggregative import PACC, ACC, EMQ, PCC, CC, DMy, T50, MS2, KDEyML, KDEyCS, KDEyHD
from sklearn import clone
import quapy.functional as F


# datasets = qp.datasets.UCI_DATASETS
datasets = ['imdb']

# target = 'f1'
target = 'acc'

errors = []


def method_1(cls, q, train, val, sample, y=None, y_hat=None):
    """
    Converts a misclassification matrix computed in validation (i.e., in the training distribution P) into
    the equivalent misclassification matrix in test (i.e., in the test distribution Q)
    by relying on the PPS assumptions.

    :return: tuple (tn, fn, fp, tp) of floats in [0,1] summing up to 1
    """

    y_val = val.labels
    y_hat_val = cls.predict(val.instances)

    # q = EMQ(LogisticRegression(class_weight='balanced'))
    # q.fit(val, fit_classifier=True)
    # q = EMQ(cls)
    # q.fit(train, fit_classifier=False)

    # q = KDEyML(cls)
    # q.fit(train, val_split=val, fit_classifier=False)
    M_hat = ACC.getPteCondEstim(train.classes_, y_val, y_hat_val)
    M_true = ACC.getPteCondEstim(train.classes_, y, y_hat)
    p_hat = q.quantify(sample.instances)
    cont_table_hat = p_hat * M_hat
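    # note: getPteCondEstim returns M with M[i, j] = P(y_hat=i | y=j) (rows index the predicted
    # label, columns the true one; columns sum to 1), so broadcasting p_hat * M scales column j
    # by the estimated prevalence Q(y=j) and yields the joint table J[i, j] = Q(y_hat=i, y=j);
    # e.g., for a perfect classifier M is the identity and the joint table reduces to diag(p_hat)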
    # cont_table_hat = np.clip(cont_table_hat, 0, 1)
    # cont_table_hat = cont_table_hat / cont_table_hat.sum()

    print('true_prev: ', sample.prevalence())
    print('estim_prev: ', p_hat)
    print('M-true:\n', M_true)
    print('M-hat:\n', M_hat)
    print('cont_table:\n', cont_table_hat)

    tp = cont_table_hat[1, 1]
    tn = cont_table_hat[0, 0]
    fn = cont_table_hat[0, 1]
    fp = cont_table_hat[1, 0]

    return tn, fn, fp, tp


def method_2(cls, train, val, sample, y=None, y_hat=None):
    """
    Assume P and Q are the training and test distributions.
    Solves the following system of linear equations:
    tp + fp = CC (the classify & count estimate, observed)
    fn + tp = Q(Y=1) (not observed, but estimated via quantification)
    tp + fp + fn + tn = 1 (trivial)

    There are 4 unknowns and 3 equations. The required fourth equation is established
    by assuming that the PPS conditions hold, i.e., that P(X|Y)=Q(X|Y); note that
    this implies P(hatY|Y)=Q(hatY|Y) if hatY is computed by any measurable function.
    In particular, we assume that the tpr in P (estimated in validation, hereafter tpr) and
    the tpr in Q (unknown, hereafter tpr_Q) are the same. This means:
    tpr = tpr_Q = tp / (tp + fn)
    which, after some manipulation, becomes:
    tp (tpr-1) + fn (tpr) = 0 <-- our last equation

    Note that the last equation relies on the estimate tpr. The more positives
    we have, the more reliable this estimate is likely to be. This suggests that, in cases
    in which the validation set contains more negatives than positives, it might be
    convenient to resort to the true negative rate (tnr) instead. This gives rise to
    the alternative fourth equation:
    tn (tnr-1) + fp (tnr) = 0

    :return: tuple (tn, fn, fp, tp) of floats in [0,1] summing up to 1
    """

    y_val = val.labels
    y_hat_val = cls.predict(val.instances)

    q = ACC(cls)
    q.fit(train, val_split=val, fit_classifier=False)
    p_hat = q.quantify(sample.instances)
    pos_prev = p_hat[1]
    # pos_prev = sample.prevalence()[1]

    cc = CC(cls)
    cc.fit(train, fit_classifier=False)
    cc_prev = cc.quantify(sample.instances)[1]

    M_hat = ACC.getPteCondEstim(train.classes_, y_val, y_hat_val)
    M_true = ACC.getPteCondEstim(train.classes_, y, y_hat)
    cont_table_true = sample.prevalence() * M_true

    if val.prevalence()[1] > 0.5:

        # in this case, the tpr might be a more reliable estimate than the tnr
        tpr_hat = M_hat[1, 1]

        # the unknowns are ordered (tn, fn, fp, tp); the rows encode the four equations above
        A = np.asarray([
            [0, 0, 1, 1],
            [0, 1, 0, 1],
            [1, 1, 1, 1],
            [0, tpr_hat, 0, tpr_hat - 1]
        ])

    else:

        # in this case, the tnr might be a more reliable estimate than the tpr
        tnr_hat = M_hat[0, 0]

        A = np.asarray([
            [0, 0, 1, 1],
            [0, 1, 0, 1],
            [1, 1, 1, 1],
            [tnr_hat - 1, 0, tnr_hat, 0]
        ])

    b = np.asarray(
        [cc_prev, pos_prev, 1, 0]
    )

    tn, fn, fp, tp = np.linalg.solve(A, b)

    cont_table_estim = np.asarray([
        [tn, fn],
        [fp, tp]
    ])

    # if (cont_table_estim < 0).any() or (cont_table_estim > 1).any():
    #     cont_table_estim = scipy.special.softmax(cont_table_estim)

    print('true_prev: ', sample.prevalence())
    print('estim_prev: ', p_hat)
    print('true_cont_table:\n', cont_table_true)
    print('estim_cont_table:\n', cont_table_estim)
    # print('true_tpr', M_true[1,1])
    # print('estim_tpr', tpr_hat)

    return tn, fn, fp, tp
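

# A minimal sanity check of the linear system solved by method_2 (an illustrative sketch,
# not part of the experiments): build a ground-truth contingency table, derive the
# observables, and verify that the solver recovers the table when tpr is exact.
def _check_method_2_system():
    tn, fn, fp, tp = 0.5, 0.1, 0.1, 0.3   # a ground-truth (normalized) contingency table
    cc = tp + fp                          # the classify & count estimate
    prev = tp + fn                        # the positive prevalence Q(Y=1)
    tpr = tp / (tp + fn)                  # the true positive rate
    A = np.asarray([
        [0, 0, 1, 1],
        [0, 1, 0, 1],
        [1, 1, 1, 1],
        [0, tpr, 0, tpr - 1]
    ])
    b = np.asarray([cc, prev, 1, 0])
    assert np.allclose(np.linalg.solve(A, b), [tn, fn, fp, tp])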


def method_3(cls, train, val, sample, y=None, y_hat=None):
    """
    This is just method 2, but without involving any of quapy's quantifiers.

    :return: tuple (tn, fn, fp, tp) of floats in [0,1] summing up to 1
    """

    classes = val.classes_
    y_val = val.labels
    y_hat_val = cls.predict(val.instances)
    M_hat = ACC.getPteCondEstim(classes, y_val, y_hat_val)
    y_hat_test = cls.predict(sample.instances)
    pos_prev_cc = F.prevalence_from_labels(y_hat_test, classes)[1]
    tpr_hat = M_hat[1, 1]
    fpr_hat = M_hat[1, 0]
    tnr_hat = M_hat[0, 0]
    if tpr_hat != fpr_hat:
        pos_prev_test_hat = (pos_prev_cc - fpr_hat) / (tpr_hat - fpr_hat)
    else:
        # degenerate case (tpr equals fpr): the adjustment is undefined, so fall back to the CC estimate
        print('--> tpr_hat == fpr_hat ==', tpr_hat)
        pos_prev_test_hat = pos_prev_cc
    pos_prev_test_hat = np.clip(pos_prev_test_hat, 0, 1)
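    # the adjustment above is the standard ACC correction: under PPS, the classify & count
    # estimate decomposes as cc = tpr * p + fpr * (1 - p), with p = Q(Y=1); solving for p
    # gives p = (cc - fpr) / (tpr - fpr)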
    pos_prev_val = val.prevalence()[1]

    if pos_prev_val > 0.5:
        # in this case, the tpr might be a more reliable estimate than the tnr
        A = np.asarray([
            [0, 0, 1, 1],
            [0, 1, 0, 1],
            [1, 1, 1, 1],
            [0, tpr_hat, 0, tpr_hat - 1]
        ])
    else:
        # in this case, the tnr might be a more reliable estimate than the tpr
        A = np.asarray([
            [0, 0, 1, 1],
            [0, 1, 0, 1],
            [1, 1, 1, 1],
            [tnr_hat - 1, 0, tnr_hat, 0]
        ])

    b = np.asarray(
        [pos_prev_cc, pos_prev_test_hat, 1, 0]
    )

    tn, fn, fp, tp = np.linalg.solve(A, b)

    return tn, fn, fp, tp


def cls_eval_from_counters(tn, fn, fp, tp):
    # the counters are normalized (they sum up to 1), so accuracy is simply tp + tn
    if target == 'acc':
        acc_hat = (tp + tn)
    else:
        den = (2 * tp + fn + fp)
        if den > 0:
            acc_hat = 2 * tp / den
        else:
            acc_hat = 0
    return acc_hat
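

# example for cls_eval_from_counters (hypothetical numbers): with the normalized counters
# (tn, fn, fp, tp) = (0.45, 0.05, 0.05, 0.45), accuracy is tp + tn = 0.9 and
# F1 is 2*0.45 / (2*0.45 + 0.05 + 0.05) = 0.9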


def cls_eval_from_labels(y, y_hat):
    if target == 'acc':
        acc = (y_hat == y).mean()
    else:
        acc = f1_score(y, y_hat, zero_division=0)
    return acc


for dataset_name in datasets:

    train_orig, test = qp.datasets.fetch_reviews(dataset_name, tfidf=True, min_df=10).train_test

    xs = []
    ys_1 = []
    ys_trval = []
    ys_3 = []

    train_prot = APP(train_orig, n_prevalences=11, repeats=1, return_type='labelled_collection', random_state=0, sample_size=10000)
    for train in train_prot():
        if np.prod(train.prevalence()) == 0:
            # skip experiments with no positives or no negatives in training
            continue

        cls = LogisticRegression(class_weight='balanced', C=100)
        # cls = CalibratedClassifierCV(LinearSVC())

        train, val = train.split_stratified(train_prop=0.5, random_state=0)

        print(f'dataset name = {dataset_name}')
        print(f'#train = {len(train)}, prev={F.strprev(train.prevalence())}')
        print(f'#val = {len(val)}, prev={F.strprev(val.prevalence())}')
        print(f'#test = {len(test)}, prev={F.strprev(test.prevalence())}')

        cls.fit(*train.Xy)

        # q = KDEyML(cls)
        q = ACC(LogisticRegression())
        q.fit(train, val_split=val, fit_classifier=True)
        # q = GridSearchQ(PACC(cls),
        #                 param_grid={'classifier__C': np.logspace(-2, 2, 5)},
        #                 protocol=APP(val, sample_size=1000),
        #                 verbose=True,
        #                 n_jobs=-1).fit(train)

        acc_trval = cls_eval_from_labels(val.labels, cls.predict(val.instances))

        for sample in APP(test, n_prevalences=21, repeats=1, sample_size=1000, return_type='labelled_collection')():
            print('=' * 80)
            y_hat = cls.predict(sample.instances)
            y = sample.labels
            acc_true = cls_eval_from_labels(y, y_hat)
            xs.append(acc_true)
            ys_trval.append(acc_trval)

            tn, fn, fp, tp = method_1(cls, q, train, val, sample, y, y_hat)
            acc_hat = cls_eval_from_counters(tn, fn, fp, tp)
            ys_1.append(acc_hat)

            tn, fn, fp, tp = method_3(cls, train, val, sample, y, y_hat)
            acc_hat = cls_eval_from_counters(tn, fn, fp, tp)
            ys_3.append(acc_hat)

            error = abs(acc_true - acc_hat)
            errors.append(error)

            print(f'classifier accuracy={acc_true:.3f}')
            print(f'estimated accuracy={acc_hat:.3f}')
            print(f'estimation error={error:.4f}')


print('process end')
print('=' * 80)
print(f'mean error = {np.mean(errors)}')
print(f'std error = {np.std(errors)}')

import matplotlib.pyplot as plt

# Create scatter plot
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.scatter(xs, ys_1, label='method 1')
plt.scatter(xs, ys_3, label='method 3')
plt.scatter(xs, ys_trval, label='tr-val')
plt.legend()

# Add labels and title
plt.xlabel('True Accuracy')
plt.ylabel('Estim Accuracy')

# Display the plot
plt.show()
@@ -0,0 +1,149 @@
from collections import defaultdict

import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
import os
import quapy as qp
from quapy.method.aggregative import PACC, EMQ, PCC, CC, ACC, HDy
from models import *
import matplotlib.pyplot as plt
from pathlib import Path


def clf():
    # return CalibratedClassifierCV(LinearSVC(class_weight=None))
    return LogisticRegression(class_weight=None)


def F1(contingency_table):
    # the table follows sklearn's confusion_matrix convention: rows index the true class,
    # columns the predicted class
    # tn = contingency_table[0, 0]
    tp = contingency_table[1, 1]
    fp = contingency_table[0, 1]
    fn = contingency_table[1, 0]
    den = (2 * tp + fp + fn)
    if den > 0:
        return 2 * tp / den
    else:
        # no positives, neither true nor predicted: F1 is conventionally taken to be 1
        return 1


def accuracy(contingency_table):
    tn = contingency_table[0, 0]
    tp = contingency_table[1, 1]
    fp = contingency_table[0, 1]
    fn = contingency_table[1, 0]
    return (tp + tn) / (tp + fp + fn + tn)


def plot_series(series, repeats, metric_name, train_prev=None, savepath=None):

    for key in series:
        print(series[key])

    fig, ax = plt.subplots()

    def bin(v):
        # arrange the flat list of values as a (n_prevalences, repeats) matrix and return
        # the mean and std across the repetitions of each prevalence value
        mat = np.asarray(v).reshape(-1, repeats)
        return mat.mean(axis=1), mat.std(axis=1)

    x = series['prev']
    x, _ = bin(x)

    for serie in series:
        if serie == 'prev':
            continue
        values = series[serie]
        print(serie, values)
        val_mean, val_std = bin(values)
        ax.errorbar(x, val_mean, label=serie, fmt='-', marker='o')
        ax.fill_between(x, val_mean - val_std, val_mean + val_std, alpha=0.25)

    if train_prev is not None:
        ax.axvline(x=train_prev, label='tr-prev', color='k', linestyle='--')
        # ax.scatter(train_prev, train_prev, c='c', label='tr-prev', linewidth=2, edgecolor='k', s=100, zorder=3)

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    ax.grid()
    ax.set_title(metric_name)
    ax.set(xlabel=r'$p_U(\oplus)$', ylabel='estimated ' + metric_name,
           title='Classifier accuracy in terms of ' + metric_name)

    if savepath is None:
        plt.show()
    else:
        os.makedirs(Path(savepath).parent, exist_ok=True)
        plt.savefig(savepath, bbox_inches='tight')


dataset = 'imdb'
data = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=5, pickle=True)

# qp.data.preprocessing.reduce_columns(data, min_df=5, inplace=True)
# print('num_features', data.training.instances.shape[1])

train = data.training
test = data.test

upper = UpperBound(clf(), y_test=None).fit(train)

mlcfe = MLCMEstimator(clf(), strategy='kfcv', k=5, n_jobs=-1).fit(train)

emq_quant = QuantificationCMPredictor(clf(), EMQ(LogisticRegression()), strategy='kfcv', k=5, n_jobs=-1).fit(train)
# cc_quant = QuantificationCMPredictor(clf(), CC(clf()), strategy='kfcv', k=5, n_jobs=-1).fit(train)
# pcc_quant = QuantificationCMPredictor(clf(), PCC(clf()), strategy='kfcv', k=5, n_jobs=-1).fit(train)
# acc_quant = QuantificationCMPredictor(clf(), ACC(clf()), strategy='kfcv', k=5, n_jobs=-1).fit(train)
pacc_quant = QuantificationCMPredictor(clf(), PACC(clf()), strategy='kfcv', k=5, n_jobs=-1).fit(train)
# hdy_quant = QuantificationCMPredictor(clf(), HDy(clf()), strategy='kfcv', k=5, n_jobs=-1).fit(train)

sld = EMQ(LogisticRegression()).fit(train)
pacc = PACC(clf()).fit(train)

contenders = [
    ('kFCV+MLPE', mlcfe),
    ('SLD', emq_quant),
    # ('CC', cc_quant),
    # ('PCC', pcc_quant),
    # ('ACC', acc_quant),
    ('PACC', pacc_quant),
    # ('HDy', hdy_quant)
]

metric = F1
# metric = accuracy

repeats = 10
with qp.util.temp_seed(42):
    samples_idx = [idx for idx in test.artificial_sampling_index_generator(sample_size=500, n_prevalences=21, repeats=repeats)]


series = defaultdict(list)
for idx in tqdm(samples_idx, desc='generating predictions'):
    sample = test.sampling_from_index(idx)

    upper.show_true_labels(sample.labels)
    upper_conf_matrix = upper.predict(sample.instances)
    metric_true = metric(upper_conf_matrix)
    series['Upper'].append(metric_true)

    for mname, method in contenders:
        conf_matrix = method.predict(sample.instances)
        estim_metric = metric(conf_matrix)
        series[mname].append(estim_metric)
        if hasattr(method, 'quantify'):
            series[mname + '-prev'].append(method.quantify(sample.instances))

    series['binsld-prev'].append(sld.quantify(sample.instances)[1])
    series['binpacc-prev'].append(pacc.quantify(sample.instances)[1])
    series['optimal-prev'].append(sample.prevalence()[1])
    series['prev'].append(sample.prevalence()[1])

metricname = metric.__name__
plot_series(series, repeats, metric_name=metricname, train_prev=train.prevalence()[1], savepath='./plots/' + dataset + '_LinearSVC_' + metricname + '.pdf')
@@ -0,0 +1,179 @@
import numpy as np
import quapy as qp
from sklearn import clone
from sklearn.metrics import confusion_matrix
import scipy
from scipy.sparse import issparse, csr_matrix
from quapy.data import LabelledCollection
from abc import ABC, abstractmethod
from sklearn.model_selection import cross_val_predict


class ConfusionMatrixPredictor(ABC):
    """
    Abstract class of predictors of the confusion matrix for the performance of a classifier.
    For the binary case, this amounts to predicting the 4-cell contingency table consisting of the
    true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN) that
    most evaluation metrics make use of.
    """
    @abstractmethod
    def fit(self, train: LabelledCollection):
        pass

    @abstractmethod
    def predict(self, test):
        pass


class MLCMEstimator(ConfusionMatrixPredictor):
    """
    The Maximum Likelihood Confusion Matrix Estimator is a method that relies on the IID assumption, and thus
    computes, via k-FCV (or any other technique), the counters of the confusion matrix in training, assuming that
    these are good estimates for the test case.
    """
    def __init__(self, classifier, strategy='kfcv', **kwargs):
        assert strategy in ['kfcv'], 'unknown strategy'
        if strategy == 'kfcv':
            assert 'k' in kwargs, 'strategy "kfcv" requires "k" to be passed as an argument'
        self.classifier = classifier
        self.strategy = strategy
        self.kwargs = kwargs

    def sout(self, msg):
        if 'verbose' in self.kwargs:
            print(msg)

    def fit(self, train: LabelledCollection):
        X, y = train.Xy
        if self.strategy == 'kfcv':
            k = self.kwargs['k']
            n_jobs = self.kwargs['n_jobs'] if 'n_jobs' in self.kwargs else 1
            predict = self.kwargs['predict'] if 'predict' in self.kwargs else 'predict'
            self.sout(f'{self.__class__.__name__}: '
                      f'running cross_val_predict with k={k} n_jobs={n_jobs} predict={predict}')
            predictions = cross_val_predict(self.classifier, X, y, cv=k, n_jobs=n_jobs, method=predict)
            self.conf_matrix = confusion_matrix(y, predictions, labels=train.classes_)
        return self

    def predict(self, test):
        """
        This method disregards the test set, under the assumption that it is IID w.r.t. the training set. This means
        that the confusion matrix for the test data should coincide with the one computed in training (using any
        cross validation strategy).

        :param test: test collection (ignored)
        :return: a confusion matrix in the return format of `sklearn.metrics.confusion_matrix`
        """
        return self.conf_matrix
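

# Usage sketch for MLCMEstimator (hypothetical variable names; assumes a binary
# LabelledCollection `train` and a test matrix `test_instances`):
#   estim = MLCMEstimator(LogisticRegression(), strategy='kfcv', k=5).fit(train)
#   conf_matrix = estim.predict(test_instances)  # same matrix for any test set, by the IID assumption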


class UpperBound(ConfusionMatrixPredictor):
    def __init__(self, classifier, y_test):
        self.classifier = classifier
        self.y_test = y_test

    def fit(self, train: LabelledCollection):
        self.classifier.fit(*train.Xy)
        self.classes = train.classes_
        return self

    def show_true_labels(self, y_test):
        self.y_test = y_test

    def predict(self, test):
        predictions = self.classifier.predict(test)
        return confusion_matrix(self.y_test, predictions, labels=self.classes)


def get_counters(y_true, y_pred):
    counters = np.full(shape=y_true.shape, fill_value=-1)
    counters[np.logical_and(y_true == 1, y_pred == 1)] = 0
    counters[np.logical_and(y_true == 1, y_pred == 0)] = 1
    counters[np.logical_and(y_true == 0, y_pred == 1)] = 2
    counters[np.logical_and(y_true == 0, y_pred == 0)] = 3
    class_map = {
        0: 'tp',
        1: 'fn',
        2: 'fp',
        3: 'tn'
    }
    return counters, class_map
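

# e.g., get_counters(np.asarray([1, 1, 0, 0]), np.asarray([1, 0, 1, 0])) returns the
# counters [0, 1, 2, 3], i.e., one instance each of tp, fn, fp, and tn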


def safehstack(matrix, posteriors):
    if issparse(matrix):
        instances = csr_matrix(scipy.sparse.hstack([matrix, posteriors]))
    else:
        instances = np.hstack([matrix, posteriors])
    return instances


class QuantificationCMPredictor(ConfusionMatrixPredictor):
    """
    Predicts the confusion matrix by casting the problem as a quantification task: every training instance is
    relabelled as the outcome (tp, fn, fp, or tn) that its classification gives rise to, and a quantifier is then
    trained to estimate the prevalence of these four "classes" in the test set.
    """
    def __init__(self, classifier, quantifier, strategy='kfcv', **kwargs):
        assert strategy in ['kfcv'], 'unknown strategy'
        if strategy == 'kfcv':
            assert 'k' in kwargs, 'strategy "kfcv" requires "k" to be passed as an argument'
        self.classifier = clone(classifier)
        self.quantifier = quantifier
        self.strategy = strategy
        self.kwargs = kwargs

    def sout(self, msg):
        if 'verbose' in self.kwargs:
            print(msg)

    def fit(self, train: LabelledCollection):
        X, y = train.Xy
        if self.strategy == 'kfcv':
            k = self.kwargs['k']
            n_jobs = self.kwargs['n_jobs'] if 'n_jobs' in self.kwargs else 1
            self.sout(f'{self.__class__.__name__}: '
                      f'running cross_val_predict with k={k} n_jobs={n_jobs}')
            predictions = cross_val_predict(self.classifier, X, y, cv=k, n_jobs=n_jobs, method='predict')
            posteriors = cross_val_predict(self.classifier, X, y, cv=k, n_jobs=n_jobs, method='predict_proba')
            self.classifier.fit(X, y)
            instances = safehstack(train.instances, posteriors)
            counters, class_map = get_counters(train.labels, predictions)
            q_data = LabelledCollection(instances=instances, labels=counters, classes_=[0, 1, 2, 3])
            print('counters prevalence', q_data.counts())
            self.quantifier.fit(q_data)
        return self
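
    # Usage sketch (hypothetical; `train` is a binary LabelledCollection and `quantifier`
    # any quapy aggregative quantifier, e.g., PACC):
    #   cm_pred = QuantificationCMPredictor(classifier, quantifier, strategy='kfcv', k=5).fit(train)
    #   conf_matrix = cm_pred.predict(test_instances)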

    def predict(self, test):
        """
        :param test: test instances
        :return: a confusion matrix in the return format of `sklearn.metrics.confusion_matrix`
        """
        posteriors = self.classifier.predict_proba(test)
        instances = safehstack(test, posteriors)
        counters = self.quantifier.quantify(instances)
        tp, fn, fp, tn = counters
        conf_matrix = np.asarray([[tn, fp], [fn, tp]])
        return conf_matrix

    def quantify(self, test):
        posteriors = self.classifier.predict_proba(test)
        instances = safehstack(test, posteriors)
        counters = self.quantifier.quantify(instances)
        tp, fn, fp, tn = counters
        den_tpr = (tp + fn)
        if den_tpr > 0:
            tpr = tp / den_tpr
        else:
            tpr = 1

        den_fpr = (fp + tn)
        if den_fpr > 0:
            fpr = fp / den_fpr
        else:
            fpr = 0

        pcc = posteriors.mean(axis=0)[1]  # the PCC prevalence estimate (mean posterior of the positive class)
        pacc = (pcc - fpr) / (tpr - fpr)  # the PACC adjustment (currently unused)
        pacc = np.clip(pacc, 0, 1)

        # the positive prevalence according to the estimated counters
        q = tp + fn
        return q