QuaPy/ClassifierAccuracy/main.py

150 lines
4.6 KiB
Python
Raw Normal View History

2024-02-23 16:55:14 +01:00
from collections import defaultdict
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
import os
import quapy as qp
from method.aggregative import PACC, EMQ, PCC, CC, ACC, HDy
from models import *
import matplotlib.pyplot as plt
from pathlib import Path
def clf():
    """Return a new, unfitted base classifier.

    Every call builds an independent ``LogisticRegression`` instance with
    ``class_weight=None``.
    """
    # Alternative base learner kept for reference:
    #   CalibratedClassifierCV(LinearSVC(class_weight=None))
    classifier = LogisticRegression(class_weight=None)
    return classifier
def F1(contingency_table):
    """Compute the F1 score of the positive class from a 2x2 table.

    The table is indexed as ``contingency_table[i, j]``; the code reads
    ``[1, 1]`` as true positives, ``[0, 1]`` as false positives and
    ``[1, 0]`` as false negatives. True negatives (``[0, 0]``) are not used.

    :param contingency_table: object supporting two-index lookup ``[i, j]``
    :return: ``2*tp / (2*tp + fp + fn)``, or ``1`` when that denominator is
        not positive (no positives present and none predicted)
    """
    true_pos = contingency_table[1, 1]
    false_pos = contingency_table[0, 1]
    false_neg = contingency_table[1, 0]

    denominator = 2 * true_pos + false_pos + false_neg
    if denominator > 0:
        return 2 * true_pos / denominator
    # Nothing to find and nothing wrongly reported: treated as a perfect score.
    return 1
def accuracy(contingency_table):
    """Compute the accuracy from a 2x2 contingency table.

    The table is indexed as ``contingency_table[i, j]``; the code reads
    ``[0, 0]`` as true negatives, ``[1, 1]`` as true positives, ``[0, 1]`` as
    false positives and ``[1, 0]`` as false negatives.

    :param contingency_table: object supporting two-index lookup ``[i, j]``
    :return: ``(tp + tn) / (tp + fp + fn + tn)``, or ``1.0`` when the table
        is empty (total count not positive)
    """
    tn = contingency_table[0, 0]
    tp = contingency_table[1, 1]
    fp = contingency_table[0, 1]
    fn = contingency_table[1, 0]

    total = tp + fp + fn + tn
    if total > 0:
        return (tp + tn) / total
    # An empty table would otherwise divide by zero; follow the same
    # convention as F1 and report a perfect score when there is nothing
    # to get wrong.
    return 1.0
def plot_series(series, repeats, metric_name, train_prev=None, savepath=None):
    """Plot each series (mean with a +/- one std band) against test prevalence.

    Every list in `series` is reshaped to ``(-1, repeats)`` and averaged
    row-wise, so values must be stored with the `repeats` repetitions of a
    prevalence point next to each other. The entry ``series['prev']`` supplies
    the x axis and is not drawn as a curve itself.

    :param series: mapping from series name to a flat sequence of values;
        must contain the key ``'prev'``
    :param repeats: number of consecutive values that belong to one x point
    :param metric_name: metric label used in the y-axis label and the title
    :param train_prev: if not None, x position of a dashed vertical line
        labelled ``'tr-prev'``
    :param savepath: if None the figure is shown interactively; otherwise it
        is written to this path (parent directories are created) and closed
    """
    # Imported here so the function does not rely on `np` reaching the module
    # namespace indirectly through a star-import.
    import numpy as np

    def _mean_std(values):
        # One row per x point, one column per repetition.
        mat = np.asarray(values).reshape(-1, repeats)
        return mat.mean(axis=1), mat.std(axis=1)

    # Debug dump of the raw values.
    for key in series:
        print(series[key])

    fig, ax = plt.subplots()

    x, _ = _mean_std(series['prev'])

    for serie in series:
        if serie == 'prev':
            continue
        values = series[serie]
        print(serie, values)
        val_mean, val_std = _mean_std(values)
        ax.errorbar(x, val_mean, label=serie, fmt='-', marker='o')
        ax.fill_between(x, val_mean - val_std, val_mean + val_std, alpha=0.25)

    if train_prev is not None:
        ax.axvline(x=train_prev, label='tr-prev', color='k', linestyle='--')

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.grid()
    # Raw string: '\o' is not a valid escape sequence in a normal literal.
    ax.set(xlabel=r'$p_U(\oplus)$', ylabel='estimated ' + metric_name,
           title='Classifier accuracy in terms of ' + metric_name)

    if savepath is None:
        plt.show()
    else:
        os.makedirs(Path(savepath).parent, exist_ok=True)
        fig.savefig(savepath, bbox_inches='tight')
        # Release the figure so repeated calls do not pile up open figures.
        plt.close(fig)
# --- Data ---------------------------------------------------------------
dataset = 'imdb'
data = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=5, pickle=True)
# Optional extra feature reduction (disabled):
#   qp.data.preprocessing.reduce_columns(data, min_df=5, inplace=True)
#   print('num_features', data.training.instances.shape[1])
train, test = data.training, data.test

# --- Confusion-matrix predictors ----------------------------------------
# Keyword arguments shared by every k-fold based predictor below.
_KFCV_KWARGS = dict(strategy='kfcv', k=5, n_jobs=-1)

upper = UpperBound(clf(), y_test=None).fit(train)
mlcfe = MLCMEstimator(clf(), **_KFCV_KWARGS).fit(train)
emq_quant = QuantificationCMPredictor(
    clf(), EMQ(LogisticRegression()), **_KFCV_KWARGS
).fit(train)
pacc_quant = QuantificationCMPredictor(
    clf(), PACC(clf()), **_KFCV_KWARGS
).fit(train)
# Other quantifier-backed predictors that can be enabled the same way:
#   cc_quant  = QuantificationCMPredictor(clf(), CC(clf()),  **_KFCV_KWARGS).fit(train)
#   pcc_quant = QuantificationCMPredictor(clf(), PCC(clf()), **_KFCV_KWARGS).fit(train)
#   acc_quant = QuantificationCMPredictor(clf(), ACC(clf()), **_KFCV_KWARGS).fit(train)
#   hdy_quant = QuantificationCMPredictor(clf(), HDy(clf()), **_KFCV_KWARGS).fit(train)

# --- Stand-alone quantifiers (used only for the prevalence series) -------
sld = EMQ(LogisticRegression()).fit(train)
pacc = PACC(clf()).fit(train)

# (label, fitted predictor) pairs that are evaluated and plotted.
contenders = [
    ('kFCV+MLPE', mlcfe),
    ('SLD', emq_quant),
    # ('CC', cc_quant),
    # ('PCC', pcc_quant),
    # ('ACC', acc_quant),
    ('PACC', pacc_quant),
    # ('HDy', hdy_quant),
]

# Metric computed on every (true or estimated) contingency table;
# set to `accuracy` to evaluate plain accuracy instead.
metric = F1

# Number of samples drawn per prevalence point.
repeats = 10
# Draw the sample indexes once, under a fixed seed, so that every method is
# evaluated on exactly the same test samples.
with qp.util.temp_seed(42):
    samples_idx = list(
        test.artificial_sampling_index_generator(
            sample_size=500, n_prevalences=21, repeats=repeats
        )
    )

# series name -> one value per test sample, in sampling order
series = defaultdict(list)

for idx in tqdm(samples_idx, desc='generating predictions'):
    sample = test.sampling_from_index(idx)
    instances = sample.instances

    # Reference curve: the upper bound is handed the true labels of the sample.
    upper.show_true_labels(sample.labels)
    series['Upper'].append(metric(upper.predict(instances)))

    for mname, method in contenders:
        series[mname].append(metric(method.predict(instances)))
        if hasattr(method, 'quantify'):
            # NOTE(review): unlike the 'bin*-prev' series below, the value
            # returned by quantify() is stored without [1] indexing; confirm
            # it is a scalar, since plot_series reshapes each series to
            # (-1, repeats).
            series[mname + '-prev'].append(method.quantify(instances))

    series['binsld-prev'].append(sld.quantify(instances)[1])
    series['binpacc-prev'].append(pacc.quantify(instances)[1])

    # True prevalence of class 1 in this sample: plotted as its own curve
    # ('optimal-prev') and also used as the x axis ('prev').
    positive_prev = sample.prevalence()[1]
    series['optimal-prev'].append(positive_prev)
    series['prev'].append(positive_prev)

metricname = metric.__name__
# NOTE(review): the file name is tagged 'LinearSVC' although clf() currently
# returns a LogisticRegression; confirm the tag is intended.
plot_series(
    series,
    repeats,
    metric_name=metricname,
    train_prev=train.prevalence()[1],
    savepath=f'./plots/{dataset}_LinearSVC_{metricname}.pdf',
)