some good refactoring
This commit is contained in:
parent caa7fd2884
commit 9d64d18cd4

@@ -0,0 +1,149 @@
from collections import defaultdict

import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
import os
import quapy as qp
from method.aggregative import PACC, EMQ, PCC, CC, ACC, HDy
from models_binary import *
import matplotlib.pyplot as plt
from pathlib import Path


def clf():
    # return CalibratedClassifierCV(LinearSVC(class_weight=None))
    return LogisticRegression(class_weight=None)


def F1(contingency_table):
    # tn = contingency_table[0, 0]
    tp = contingency_table[1, 1]
    fp = contingency_table[0, 1]
    fn = contingency_table[1, 0]
    den = (2 * tp + fp + fn)
    if den > 0:
        return 2 * tp / den
    else:
        # no positives in the predictions nor in the ground truth: F1 is 1 by convention
        return 1


def accuracy(contingency_table):
    tn = contingency_table[0, 0]
    tp = contingency_table[1, 1]
    fp = contingency_table[0, 1]
    fn = contingency_table[1, 0]
    return (tp + tn) / (tp + fp + fn + tn)
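
# Illustrative sanity check (added sketch, not part of the original commit),
# using the sklearn confusion-matrix layout [[tn, fp], [fn, tp]]:
_ct = np.asarray([[50, 10], [5, 35]])
assert abs(F1(_ct) - 2 * 35 / (2 * 35 + 10 + 5)) < 1e-9  # ~0.8235
assert accuracy(_ct) == (35 + 50) / 100                  # 0.85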


def plot_series(series, repeats, metric_name, train_prev=None, savepath=None):

    for key in series:
        print(series[key])

    fig, ax = plt.subplots()

    def bin(v):
        # group consecutive repeats of the same prevalence and reduce to mean/std
        mat = np.asarray(v).reshape(-1, repeats)
        return mat.mean(axis=1), mat.std(axis=1)

    x = series['prev']
    x, _ = bin(x)

    for serie in series:
        if serie == 'prev':
            continue
        values = series[serie]
        print(serie, values)
        val_mean, val_std = bin(values)
        ax.errorbar(x, val_mean, label=serie, fmt='-', marker='o')
        ax.fill_between(x, val_mean - val_std, val_mean + val_std, alpha=0.25)

    if train_prev is not None:
        ax.axvline(x=train_prev, label='tr-prev', color='k', linestyle='--')
        # ax.scatter(train_prev, train_prev, c='c', label='tr-prev', linewidth=2, edgecolor='k', s=100, zorder=3)

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    ax.grid()
    ax.set_title(metric_name)
    ax.set(xlabel=r'$p_U(\oplus)$', ylabel='estimated ' + metric_name,
           title='Classifier accuracy in terms of ' + metric_name)

    if savepath is None:
        plt.show()
    else:
        os.makedirs(Path(savepath).parent, exist_ok=True)
        plt.savefig(savepath, bbox_inches='tight')
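
# Illustrative input format (added sketch, not part of the original commit):
# `series` maps curve names to flat lists of length n_prevalences*repeats and
# must include the key 'prev' holding the sampled prevalences, e.g.:
#   demo = defaultdict(list)
#   demo['prev'] = [0.0, 0.0, 0.5, 0.5, 1.0, 1.0]
#   demo['Upper'] = [0.1, 0.2, 0.5, 0.6, 0.9, 1.0]
#   plot_series(demo, repeats=2, metric_name='F1')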


dataset = 'imdb'
data = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=5, pickle=True)

# qp.data.preprocessing.reduce_columns(data, min_df=5, inplace=True)
# print('num_features', data.training.instances.shape[1])

train = data.training
test = data.test

upper = UpperBound(clf(), y_test=None).fit(train)

mlcfe = MLCMEstimator(clf(), strategy='kfcv', k=5, n_jobs=-1).fit(train)

emq_quant = QuantificationCMPredictor(clf(), EMQ(LogisticRegression()), strategy='kfcv', k=5, n_jobs=-1).fit(train)
# cc_quant = QuantificationCMPredictor(clf(), CC(clf()), strategy='kfcv', k=5, n_jobs=-1).fit(train)
# pcc_quant = QuantificationCMPredictor(clf(), PCC(clf()), strategy='kfcv', k=5, n_jobs=-1).fit(train)
# acc_quant = QuantificationCMPredictor(clf(), ACC(clf()), strategy='kfcv', k=5, n_jobs=-1).fit(train)
pacc_quant = QuantificationCMPredictor(clf(), PACC(clf()), strategy='kfcv', k=5, n_jobs=-1).fit(train)
# hdy_quant = QuantificationCMPredictor(clf(), HDy(clf()), strategy='kfcv', k=5, n_jobs=-1).fit(train)

sld = EMQ(LogisticRegression()).fit(train)
pacc = PACC(clf()).fit(train)

contenders = [
    ('kFCV+MLPE', mlcfe),
    ('SLD', emq_quant),
    # ('CC', cc_quant),
    # ('PCC', pcc_quant),
    # ('ACC', acc_quant),
    ('PACC', pacc_quant),
    # ('HDy', hdy_quant)
]

metric = F1
# metric = accuracy

repeats = 10
with qp.util.temp_seed(42):
    samples_idx = [idx for idx in test.artificial_sampling_index_generator(sample_size=500, n_prevalences=21, repeats=repeats)]


series = defaultdict(list)
for idx in tqdm(samples_idx, desc='generating predictions'):
    sample = test.sampling_from_index(idx)

    upper.show_true_labels(sample.labels)
    upper_conf_matrix = upper.predict(sample.instances)
    metric_true = metric(upper_conf_matrix)
    series['Upper'].append(metric_true)

    for mname, method in contenders:
        conf_matrix = method.predict(sample.instances)
        estim_metric = metric(conf_matrix)
        series[mname].append(estim_metric)
        if hasattr(method, 'quantify'):
            series[mname + '-prev'].append(method.quantify(sample.instances))

    series['binsld-prev'].append(sld.quantify(sample.instances)[1])
    series['binpacc-prev'].append(pacc.quantify(sample.instances)[1])
    series['optimal-prev'].append(sample.prevalence()[1])
    series['prev'].append(sample.prevalence()[1])

metricname = metric.__name__
plot_series(series, repeats, metric_name=metricname, train_prev=train.prevalence()[1],
            savepath='./plots/' + dataset + '_LinearSVC_' + metricname + '.pdf')

@@ -1,148 +1,75 @@
from collections import defaultdict

from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import confusion_matrix

from method.aggregative import PACC, EMQ
from utils import *

import quapy.data.datasets
import quapy as qp
from models_multiclass import *
from quapy.data import LabelledCollection
from quapy.protocol import UPP
from quapy.data.datasets import fetch_UCIMulticlassLabelledCollection, UCI_MULTICLASS_DATASETS


def split(data: LabelledCollection):
    train_val, test = data.split_stratified(train_prop=0.66)
    train, val = train_val.split_stratified(train_prop=0.5)
    return train, val, test


def gen_datasets() -> [str, [LabelledCollection, LabelledCollection, LabelledCollection]]:
    for dataset_name in UCI_MULTICLASS_DATASETS:
        dataset = fetch_UCIMulticlassLabelledCollection(dataset_name)
        yield dataset_name, split(dataset)


def gen_CAP(h, acc_fn) -> [str, ClassifierAccuracyPrediction]:
    yield 'Naive', NaiveCAP(h, acc_fn)
    yield 'CT-PPS-PACC', ContTableTransferCAP(h, acc_fn, PACC(LogisticRegression()))
    yield 'CT-PPSh-PACC', ContTableWithHTransferCAP(h, acc_fn, PACC)


def true_acc(h: BaseEstimator, acc_fn: callable, U: LabelledCollection):
    y_pred = h.predict(U.X)
    y_true = U.y
    conf_table = confusion_matrix(y_true, y_pred=y_pred, labels=U.classes_)
    return acc_fn(conf_table)


def acc_fn(cont_table):
    return np.diag(cont_table).sum() / cont_table.sum()
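
# Illustrative check (added sketch, not part of the original commit): acc_fn
# generalizes binary accuracy to the multiclass case as the trace of the
# confusion matrix over its total mass.
assert acc_fn(np.asarray([[30, 5, 0], [2, 40, 3], [0, 5, 15]])) == 0.85  # 85/100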


qp.environ['SAMPLE_SIZE'] = 100

h = LogisticRegression()

acc_trues = []
acc_predicted = defaultdict(list)

for dataset_name, (L, V, U) in gen_datasets():
    print(dataset_name)

    h.fit(*L.Xy)

    test_prot = UPP(U, repeats=100, return_type='labelled_collection')

    acc_trues.extend(true_acc(h, acc_fn, Ui) for Ui in test_prot())

    for method_name, method in gen_CAP(h, acc_fn):
        method.fit(V)

        for Ui in test_prot():
            acc_hat = method.predict(Ui.X)
            acc_predicted[method_name].append(acc_hat)

acc_predicted = list(acc_predicted.items())
plot_diagonal('./plots/diagonal.png', acc_trues, acc_predicted)

@@ -0,0 +1,236 @@
import numpy as np
from sklearn.base import BaseEstimator

import quapy as qp
from sklearn import clone
from sklearn.metrics import confusion_matrix
import scipy
from scipy.sparse import issparse, csr_matrix
from data import LabelledCollection
from abc import ABC, abstractmethod
from sklearn.model_selection import cross_val_predict

from quapy.method.base import BaseQuantifier
from quapy.method.aggregative import PACC


class ClassifierAccuracyPrediction(ABC):

    def __init__(self, h: BaseEstimator, acc: callable):
        self.h = h
        self.acc = acc

    @abstractmethod
    def fit(self, val: LabelledCollection):
        ...

    def predict(self, X):
        """
        Evaluates the accuracy function on the predicted contingency table

        :param X: test data
        :return: float
        """
        return self.acc(self.predict_ct(X))

    @abstractmethod
    def predict_ct(self, X):
        """
        Predicts the contingency table for the test data

        :param X: test data
        :return: a contingency table
        """
        ...


class NaiveCAP(ClassifierAccuracyPrediction):
    """
    The Naive CAP is a method that relies on the IID assumption, and thus uses the estimate obtained on the
    validation data as an estimate for the test data.
    """
    def __init__(self, h: BaseEstimator, acc: callable):
        super().__init__(h, acc)

    def fit(self, val: LabelledCollection):
        y_hat = self.h.predict(val.X)
        y_true = val.y
        self.cont_table = confusion_matrix(y_true, y_pred=y_hat, labels=val.classes_)
        return self

    def predict_ct(self, test):
        """
        This method disregards the test set, under the assumption that it is IID with respect to the training
        data. This means that the confusion matrix for the test data should coincide with the one computed on
        the validation data (using any cross-validation strategy).

        :param test: test collection (ignored)
        :return: a confusion matrix in the return format of `sklearn.metrics.confusion_matrix`
        """
        return self.cont_table


class ContTableTransferCAP(ClassifierAccuracyPrediction):
    """
    Transfers the validation contingency table to the test data by reweighting its rows (the true classes)
    by the ratio between the test prevalence, as estimated by a quantifier, and the validation prevalence.
    """
    def __init__(self, h: BaseEstimator, acc: callable, q: BaseQuantifier):
        super().__init__(h, acc)
        self.q = q

    def fit(self, val: LabelledCollection):
        y_hat = self.h.predict(val.X)
        y_true = val.y
        self.cont_table = confusion_matrix(y_true, y_pred=y_hat, labels=val.classes_)
        self.train_prev = val.prevalence()
        self.q.fit(val)
        return self

    def predict_ct(self, test):
        """
        :param test: test collection
        :return: a confusion matrix in the return format of `sklearn.metrics.confusion_matrix`
        """
        prev_hat = self.q.quantify(test)
        adjustment = prev_hat / self.train_prev
        return self.cont_table * adjustment[:, np.newaxis]
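
# Worked example of the adjustment above (added sketch, not part of the original
# commit): if the validation contingency table is [[40, 10], [20, 30]] (rows are
# true classes), the validation prevalence is [0.5, 0.5], and the quantifier
# estimates a test prevalence of [0.8, 0.2], the rows get scaled by [1.6, 0.4]:
#   np.asarray([[40, 10], [20, 30]]) * (np.asarray([.8, .2]) / np.asarray([.5, .5]))[:, np.newaxis]
#   # -> [[64., 16.], [ 8., 12.]]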


class ContTableWithHTransferCAP(ClassifierAccuracyPrediction):
    """
    Same transfer strategy as ContTableTransferCAP, but the quantifier is instantiated from a quantifier
    class on top of the classifier `h` itself (and fit with fit_classifier=False).
    """
    def __init__(self, h: BaseEstimator, acc: callable, q_class):
        super().__init__(h, acc)
        self.q = q_class(classifier=h)

    def fit(self, val: LabelledCollection):
        y_hat = self.h.predict(val.X)
        y_true = val.y
        self.cont_table = confusion_matrix(y_true, y_pred=y_hat, labels=val.classes_)
        self.train_prev = val.prevalence()
        self.q.fit(val, fit_classifier=False, val_split=val)
        return self

    def predict_ct(self, test):
        """
        :param test: test collection
        :return: a confusion matrix in the return format of `sklearn.metrics.confusion_matrix`
        """
        prev_hat = self.q.quantify(test)
        adjustment = prev_hat / self.train_prev
        return self.cont_table * adjustment[:, np.newaxis]


class UpperBound(ClassifierAccuracyPrediction):
    def __init__(self, classifier, y_test):
        self.classifier = classifier
        self.y_test = y_test

    def fit(self, train: LabelledCollection):
        self.classifier.fit(*train.Xy)
        self.classes = train.classes_
        return self

    def show_true_labels(self, y_test):
        self.y_test = y_test

    def predict(self, test):
        predictions = self.classifier.predict(test)
        return confusion_matrix(self.y_test, predictions, labels=self.classes)


def get_counters(y_true, y_pred):
    # encode each binary decision as one of the four outcomes: tp, fn, fp, tn
    counters = np.full(shape=y_true.shape, fill_value=-1)
    counters[np.logical_and(y_true == 1, y_pred == 1)] = 0
    counters[np.logical_and(y_true == 1, y_pred == 0)] = 1
    counters[np.logical_and(y_true == 0, y_pred == 1)] = 2
    counters[np.logical_and(y_true == 0, y_pred == 0)] = 3
    class_map = {
        0: 'tp',
        1: 'fn',
        2: 'fp',
        3: 'tn'
    }
    return counters, class_map


def safehstack(matrix, posteriors):
    # horizontally stack features and posteriors, preserving sparsity if present
    if issparse(matrix):
        instances = csr_matrix(scipy.sparse.hstack([matrix, posteriors]))
    else:
        instances = np.hstack([matrix, posteriors])
    return instances


class QuantificationCMPredictor(ClassifierAccuracyPrediction):
    """
    Predicts the confusion matrix by learning a quantifier for the prevalence of the four outcomes
    (tp, fn, fp, tn) of the classifier decisions.
    """
    def __init__(self, classifier, quantifier, strategy='kfcv', **kwargs):
        assert strategy in ['kfcv'], 'unknown strategy'
        if strategy == 'kfcv':
            assert 'k' in kwargs, 'strategy "kfcv" requires "k" to be passed as an argument'
        self.classifier = clone(classifier)
        self.quantifier = quantifier
        self.strategy = strategy
        self.kwargs = kwargs

    def sout(self, msg):
        if 'verbose' in self.kwargs:
            print(msg)

    def fit(self, train: LabelledCollection):
        X, y = train.Xy
        if self.strategy == 'kfcv':
            k = self.kwargs['k']
            n_jobs = self.kwargs['n_jobs'] if 'n_jobs' in self.kwargs else 1
            self.sout(f'{self.__class__.__name__}: '
                      f'running cross_val_predict with k={k} n_jobs={n_jobs}')
            predictions = cross_val_predict(self.classifier, X, y, cv=k, n_jobs=n_jobs, method='predict')
            posteriors = cross_val_predict(self.classifier, X, y, cv=k, n_jobs=n_jobs, method='predict_proba')
            self.classifier.fit(X, y)
            instances = safehstack(train.instances, posteriors)
            counters, class_map = get_counters(train.labels, predictions)
            q_data = LabelledCollection(instances=instances, labels=counters, classes_=[0, 1, 2, 3])
            print('counters prevalence', q_data.counts())
            self.quantifier.fit(q_data)
        return self

    def predict(self, test):
        """
        :param test: test instances
        :return: a confusion matrix in the return format of `sklearn.metrics.confusion_matrix`
        """
        posteriors = self.classifier.predict_proba(test)
        instances = safehstack(test, posteriors)
        counters = self.quantifier.quantify(instances)
        tp, fn, fp, tn = counters
        conf_matrix = np.asarray([[tn, fp], [fn, tp]])
        return conf_matrix

    def quantify(self, test):
        posteriors = self.classifier.predict_proba(test)
        instances = safehstack(test, posteriors)
        counters = self.quantifier.quantify(instances)
        tp, fn, fp, tn = counters
        den_tpr = (tp + fn)
        if den_tpr > 0:
            tpr = tp / den_tpr
        else:
            tpr = 1

        den_fpr = (fp + tn)
        if den_fpr > 0:
            fpr = fp / den_fpr
        else:
            fpr = 0

        pcc = posteriors.sum(axis=0)[1]
        pacc = (pcc - fpr) / (tpr - fpr)
        pacc = np.clip(pacc, 0, 1)

        # note: pacc is computed but not returned; the estimate used is the quantified positive mass
        q = tp + fn
        return q
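
# Illustrative numbers for the correction above (added sketch, not part of the
# original commit): with tpr=0.9, fpr=0.2 and a normalized expected positive
# rate pcc=0.55, the PACC-corrected prevalence is (0.55-0.2)/(0.9-0.2) = 0.5.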

@@ -0,0 +1,17 @@
# Notes

Branch for research on classifier accuracy prediction.

I had some work done for the binary case (models_binary.py and main_binary.py).
I would like to approach the multiclass case directly now.

I think I will frame the problem setting as follows.
A Classifier Accuracy Prediction (CAP) method is a method that receives as input:
- h: a classifier (already trained),
- V: a labelled collection (for training the CAP),
- acc_func: a callable, i.e., any function that works on a contingency table

And implements:
- fit: trains the CAP
- predict: predicts the evaluation measure on unseen data (provided by the base class; it calls predict_ct and then acc_func)
- predict_ct: predicts the contingency table
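A minimal, self-contained sketch of this interface (an illustrative naive variant; the class and helper names below are not the actual implementations in models_multiclass.py):

```python
import numpy as np
from sklearn.metrics import confusion_matrix


def multiclass_acc(ct):
    # accuracy as the trace of the contingency table over its total mass
    return np.diag(ct).sum() / ct.sum()


class NaiveSketchCAP:
    """Naive baseline: assume the test set behaves like the validation set."""
    def __init__(self, h, acc_func):
        self.h = h                # already-trained classifier
        self.acc_func = acc_func  # any function of a contingency table

    def fit(self, V_X, V_y):
        # memorize the contingency table observed on the validation data
        self.cont_table = confusion_matrix(V_y, self.h.predict(V_X))
        return self

    def predict_ct(self, X):
        # naive: the test table is assumed to coincide with the validation one
        return self.cont_table

    def predict(self, X):
        return self.acc_func(self.predict_ct(X))


# usage: NaiveSketchCAP(h, multiclass_acc).fit(V_X, V_y).predict(test_X)
```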

@@ -0,0 +1,29 @@
import matplotlib.pyplot as plt
from pathlib import Path
from os import makedirs
import numpy as np


def plot_diagonal(outpath, xs, predictions: list):

    makedirs(Path(outpath).parent, exist_ok=True)

    # create the scatter plot
    plt.figure(figsize=(10, 10))
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.plot([0, 1], [0, 1], color='black', linestyle='--')

    for method_name, ys in predictions:
        pear_cor = np.corrcoef(xs, ys)[0, 1]
        plt.scatter(xs, ys, label=f'{method_name} {pear_cor:.2f}')

    plt.legend()

    # add labels and title
    plt.xlabel('True Accuracy')
    plt.ylabel('Estimated Accuracy')

    # save the plot
    # plt.show()
    plt.savefig(outpath)