QuaPy/Census/classification_accuracy_1.py

74 lines
2.1 KiB
Python

import os

import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.svm import SVC, LinearSVC

# NOTE: the wildcard import is deliberately kept ahead of the explicit imports
# below (original order), so that the explicit names win any name clash.
from commons import *
from table import Table
from tqdm import tqdm
# Remove numpy's line-width limit so printed arrays are never wrapped.
np.set_printoptions(linewidth=np.inf)
def classifiers():
    """Generate ``(name, estimator)`` pairs for the classifiers to compare.

    Every estimator is instantiated only when the generator reaches it, so
    consumers that stop early never build the remaining ones.

    Yields:
        tuple: a short display name and a fresh, unfitted scikit-learn
        estimator.
    """
    factories = (
        ('LR-opt', lambda: LogisticRegressionCV(Cs=10)),
        ('LR-opt-bal', lambda: LogisticRegressionCV(class_weight='balanced', Cs=10)),
        ('LR-def', lambda: LogisticRegression()),
        ('SVM-linear', lambda: LinearSVC()),
        ('SVM-rbf', lambda: SVC(kernel='rbf')),
    )
    for label, build in factories:
        yield label, build()
# ---------------------------------------------------------------------------
# Cross-area classification accuracy.
#
# For every classifier produced by classifiers(): fit it on the data of one
# area, evaluate plain accuracy on each of the other areas, and collect the
# scores in a Table (rows 'te-<area>' = test area, columns 'tr-<area>' =
# training area). Per test area, the best, worst and average scores are added
# as extra entries. All tables are rendered to a single PDF and a one-line
# summary per classifier is written to a text file.
# ---------------------------------------------------------------------------
survey_y = './data/survey_y.csv'
results_dir = './results/classifier'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

# Each element of `trains` is unpacked below as an (area, X, y) triple.
trains = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(trains)

areas = [Ai for Ai, _, _ in trains]

tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test
methods = [f'tr-{Ai}' for Ai in areas]  # areas on which a classifier is trained

for cls_name, c in classifiers():
    table = Table(
        name=cls_name,
        benchmarks=benchmarks,
        methods=methods,
        stat_test=None,
        color_mode='local',
        lower_is_better=False,
    )
    table.format.mean_prec = 4
    table.format.show_std = False
    table.format.stat_test = False
    table.format.remove_zero = True

    for i, (Ai, Xi, yi) in tqdm(enumerate(trains), total=n_areas):
        # The same estimator object is re-fitted for every training area.
        c.fit(Xi, yi)
        for j, (Aj, Xj, yj) in enumerate(trains):
            if i == j:
                # Never evaluate on the area the classifier was trained on.
                continue
            pred_labels = c.predict(Xj)
            true_labels = yj
            acc = (pred_labels == true_labels).mean()
            table.add(benchmark=f'te-{Aj}', method=f'tr-{Ai}', v=acc)

    for test in benchmarks:
        # Read the scores once, before the summary entries are added, so that
        # Best/Worst/AVE are all computed from the same set of values.
        values = table.get_benchmark_values(test)
        table.add(benchmark=test, method='Best', v=max(values))
        table.add(benchmark=test, method='Worst', v=min(values))
        table.add(benchmark=test, method='AVE', v=np.mean(values))
    tables.append(table)

    text_outputs.append(f'{cls_name} got mean {table.all_mean():.5f}')

# Make sure the output directory exists; otherwise open() below fails with
# FileNotFoundError on a fresh checkout.
os.makedirs(results_dir, exist_ok=True)

Table.LatexPDF(f'{results_dir}/doc.pdf', tables)

with open(f'{results_dir}/output.txt', 'w') as fout:
    fout.write('\n'.join(text_outputs))