# QuaPy/ClassifierAccuracy/commons.py

from collections import defaultdict
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
import numpy as np
from time import time
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from method.aggregative import PACC, EMQ, ACC
from utils import *
import quapy.data.datasets
import quapy as qp
from models_multiclass import *
from quapy.data import LabelledCollection
from quapy.protocol import UPP
from quapy.data.datasets import fetch_UCIMulticlassLabelledCollection, UCI_MULTICLASS_DATASETS


def split(data: LabelledCollection):
    # stratified split: ~1/3 held out for test, the remaining 2/3 halved into train and val
    train_val, test = data.split_stratified(train_prop=0.66, random_state=0)
    train, val = train_val.split_stratified(train_prop=0.5, random_state=0)
    return train, val, test


def gen_classifiers():
    yield 'LR', LogisticRegression()
    #yield 'NB', GaussianNB()
    #yield 'SVM(rbf)', SVC()
    #yield 'SVM(linear)', LinearSVC()


def gen_datasets() -> [str, [LabelledCollection, LabelledCollection, LabelledCollection]]:
    for dataset_name in UCI_MULTICLASS_DATASETS:
        dataset = fetch_UCIMulticlassLabelledCollection(dataset_name)
        yield dataset_name, split(dataset)


def gen_CAP(h, acc_fn) -> [str, ClassifierAccuracyPrediction]:
    yield 'SebCAP', SebastianiCAP(h, acc_fn, ACC)
    yield 'SebCAPweight', SebastianiCAP(h, acc_fn, ACC, alpha=0)
    yield 'PabCAP', PabloCAP(h, acc_fn, ACC)
    yield 'PabCAP-SLD-median', PabloCAP(h, acc_fn, EMQ, aggr='median')


def gen_CAP_cont_table(h) -> [str, CAPContingencyTable]:
    acc_fn = None
    # yield 'Naive', NaiveCAP(h, acc_fn)
    yield 'CT-PPS-EMQ', ContTableTransferCAP(h, acc_fn, EMQ(LogisticRegression()))
    #yield 'CT-PPSh-ACC', ContTableWithHTransferCAP(h, acc_fn, ACC)
    yield 'Equations-ACCh', NsquaredEquationsCAP(h, acc_fn, ACC, reuse_h=True)
    # yield 'Equations-ACC', NsquaredEquationsCAP(h, acc_fn, ACC)
    yield 'Equations-SLD', NsquaredEquationsCAP(h, acc_fn, EMQ)


def gen_acc_measure():
    yield 'vanilla_accuracy', vanilla_acc_fn
    yield 'macro-F1', macrof1


def true_acc(h: BaseEstimator, acc_fn: callable, U: LabelledCollection):
    # true accuracy of classifier h on the test collection U, obtained by applying
    # acc_fn to the confusion matrix of true vs. predicted labels
    y_pred = h.predict(U.X)
    y_true = U.y
    conf_table = confusion_matrix(y_true, y_pred=y_pred, labels=U.classes_)
    return acc_fn(conf_table)


def vanilla_acc_fn(cont_table):
    return np.diag(cont_table).sum() / cont_table.sum()


def _f1_bin(tp, fp, fn):
    if tp + fp + fn == 0:
        # degenerate case (no positives at all): F1 is taken to be 1
        return 1
    else:
        return (2 * tp) / (2 * tp + fp + fn)


def macrof1(cont_table):
    n = cont_table.shape[0]

    if n == 2:
        tp = cont_table[1, 1]
        fp = cont_table[0, 1]
        fn = cont_table[1, 0]
        return _f1_bin(tp, fp, fn)

    f1_per_class = []
    for i in range(n):
        tp = cont_table[i, i]
        fp = cont_table[:, i].sum() - tp
        fn = cont_table[i, :].sum() - tp
        f1_per_class.append(_f1_bin(tp, fp, fn))
    return np.mean(f1_per_class)


def microf1(cont_table):
    n = cont_table.shape[0]

    if n == 2:
        tp = cont_table[1, 1]
        fp = cont_table[0, 1]
        fn = cont_table[1, 0]
        return _f1_bin(tp, fp, fn)

    # micro-averaging: pool the per-class counts across classes, then compute F1 once
    tp, fp, fn = 0, 0, 0
    for i in range(n):
        tp_i = cont_table[i, i]
        tp += tp_i
        fp += cont_table[:, i].sum() - tp_i
        fn += cont_table[i, :].sum() - tp_i
    return _f1_bin(tp, fp, fn)
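

# Hand-worked check of the two F1 variants on a small 3-class confusion matrix
# (rows = true labels, columns = predictions), useful when modifying the code above:
#
#   C = np.array([[5, 1, 0],
#                 [0, 3, 2],
#                 [1, 0, 4]])
#
#   per-class F1 = 10/12, 6/9, 8/11  ->  macrof1(C) ~= 0.742
#   pooled counts tp=12, fp=4, fn=4  ->  microf1(C)  = 24/32 = 0.75
#   (for single-label data, micro-F1 coincides with vanilla accuracy: 12/16 = 0.75)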


def cap_errors(true_acc, estim_acc):
    true_acc = np.asarray(true_acc)
    estim_acc = np.asarray(estim_acc)
    #return (true_acc - estim_acc)**2
    return np.abs(true_acc - estim_acc)
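

# --------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): wires the helpers above together
# for a single dataset and classifier. The UPP arguments used here
# (sample_size, repeats, return_type='labelled_collection') are assumptions
# about the quapy.protocol API and may need adjusting for your QuaPy version.
if __name__ == '__main__':
    dataset_name, (train, val, test) = next(gen_datasets())
    cls_name, h = next(gen_classifiers())
    h.fit(*train.Xy)

    acc_name, acc_fn = next(gen_acc_measure())

    # true accuracy on the full test split
    print(f'{cls_name} on {dataset_name}: {acc_name}={true_acc(h, acc_fn, test):.4f}')

    # true accuracy across test samples generated under prior probability shift
    prot = UPP(test, sample_size=100, repeats=10, random_state=0, return_type='labelled_collection')
    for i, Ui in enumerate(prot()):
        print(f'sample {i}: {acc_name}={true_acc(h, acc_fn, Ui):.4f}')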