# QuaPy/Census/simulated-cnr/main.py
#
# 150 lines
# 4.6 KiB
# Python

import os
from os.path import join
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from pathlib import Path
from quapy.data import LabelledCollection
from quapy.method.aggregative import PACC, PCC, EMQ, DMy, ACC, KDEyML, CC
from tqdm import tqdm
from commons import configs
from src.new_table import LatexTable
# Console display settings for the DataFrames printed by this script:
# no limit on the number of columns shown, and a 1000-character display width.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
def load_data(data_path):
    """Load one simulated dataset and decode its settings from the file name.

    The file name (minus a trailing ``.csv``) must be made of exactly five
    underscore-separated tokens, ``<prefix>_nF<int>_<nA>_<P>_nExp<int>``,
    for example ``data_nF5_nA50_P50000_nExp100.csv``. Only the ``nF``
    (number of feature columns) and ``nExp`` (number of experiments) tokens
    are used; the other tokens are ignored.

    Parameters
    ----------
    data_path : str or path-like
        CSV file read with its first column as index. It must contain the
        columns ``X_0`` ... ``X_{nF-1}``, ``Y`` and ``area``.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix with one row per CSV row and one column per feature.
    y : numpy.ndarray
        Values of the ``Y`` column.
    areas : numpy.ndarray
        Values of the ``area`` column.
    nExp : int
        Number of experiments encoded in the file name.
    df : pandas.DataFrame
        The full data frame as read from disk.

    Raises
    ------
    ValueError
        If the file name does not follow the expected pattern. The name is
        validated before the CSV is read, so a bad path fails fast.
    KeyError
        If one of the expected ``X_<i>`` feature columns is missing.
    """
    file_name = Path(data_path).name
    # Strip only a trailing extension; a '.csv' elsewhere in the name is kept.
    if file_name.endswith('.csv'):
        stem = file_name[:-len('.csv')]
    else:
        stem = file_name

    tokens = stem.split('_')
    if len(tokens) != 5:
        raise ValueError(
            f'Unexpected data file name {file_name!r}: expected 5 '
            f'underscore-separated tokens like '
            f'"data_nF<int>_nA<int>_P<int>_nExp<int>.csv", got {len(tokens)}'
        )
    _, nf_token, _, _, nexp_token = tokens

    try:
        nF = int(nf_token.replace('nF', ''))
        nExp = int(nexp_token.replace('nExp', ''))
    except ValueError as err:
        raise ValueError(
            f'Unexpected data file name {file_name!r}: cannot parse the '
            f'"nF"/"nExp" tokens ({nf_token!r}, {nexp_token!r}) as integers'
        ) from err

    df = pd.read_csv(data_path, index_col=0)

    # Stack the feature columns row-wise, then transpose to (rows, features).
    X = np.asarray([df[f'X_{feat_id}'].values for feat_id in range(nF)]).T
    y = df.Y.values
    areas = df.area.values
    return X, y, areas, nExp, df
def methods():
    """Yield ``(name, quantifier)`` pairs for the methods under evaluation.

    Every call produces fresh, unfitted quantifier instances, each wrapping
    its own default ``LogisticRegression`` classifier. The order of the
    pairs is the order in which the methods are run.
    """
    # (display name, quantifier class, extra constructor keyword arguments)
    catalogue = (
        ('CC', CC, {}),
        ('PCC', PCC, {}),
        ('ACC', ACC, {}),
        ('PACC', PACC, {}),
        ('EMQ', EMQ, {}),
        ('KDEy', KDEyML, {'bandwidth': 0.05}),
        # A 'KDEy01' variant (KDEyML with its default bandwidth) is disabled.
    )
    for label, quantifier_cls, extra_kwargs in catalogue:
        yield label, quantifier_cls(classifier=LogisticRegression(), **extra_kwargs)
# Main experiment loop: for every configuration, load the simulated dataset,
# evaluate each quantification method per area over all experiments (reusing
# cached per-method CSV results when present), then render a LaTeX/PDF table
# and print a pivot table of the mean absolute error per area and method.
for config in configs:
    print(f'Running {config}')

    config_name = f'data_nF{config.n_features}_nA50_P50000_nExp100'
    data_path = f'./data/{config_name}.csv'
    result_dir = f'./results/{config_name}'
    os.makedirs(result_dir, exist_ok=True)

    X, y, A, numExperiments, df = load_data(data_path)
    areas = sorted(np.unique(A))
    n_areas = len(areas)

    methods_results = []

    # load baseline result from UniPI
    baseline_path = join(result_dir, 'Risultati_SAE.csv')
    if os.path.exists(baseline_path):
        unipi_baseline_df = pd.read_csv(baseline_path, index_col=0, sep=';')
        # Align the baseline's error column with the 'AE' column used below.
        unipi_baseline_df = unipi_baseline_df.rename(columns={'AE(SAE)': 'AE'})
        unipi_baseline_name = "SAE"
        methods_results.append(unipi_baseline_df)
    else:
        unipi_baseline_name = None

    # Per-area test data, as (area, X_area, true_prev) tuples. Each area is
    # evaluated on the full population (sampled units included; the
    # out-of-sample variant is disabled), so these do not depend on the
    # experiment. Built lazily, only if some method has no cached results.
    area_tests = None

    # run quantification methods
    for q_name, quantifier in methods():
        result_path = join(result_dir, f'{q_name}.csv')
        if os.path.exists(result_path):
            # Cached files are written without an index column, so read them
            # back without `index_col`: `experiment_id` then stays a regular
            # column, exactly as in freshly computed results.
            method_results = pd.read_csv(result_path)
        else:
            if area_tests is None:
                area_tests = []
                for area in areas:
                    in_area = A == area
                    test_A = LabelledCollection(X[in_area], y[in_area], classes=[0, 1])
                    area_tests.append((area, test_A.X, test_A.prevalence()[1]))

            results = []
            pbar = tqdm(range(numExperiments), total=numExperiments)
            pbar.set_description(f'q_name={q_name}')
            for experiment_id in pbar:
                # Boolean mask of the units sampled for this experiment.
                in_sample = df[f'InSample_{experiment_id}'].values.astype(dtype=bool)
                train = LabelledCollection(X[in_sample], y[in_sample], classes=[0, 1])
                quantifier.fit(train)

                for area, X_area, true_prev in area_tests:
                    # Index 1 selects the prevalence of class 1.
                    pred_prev = quantifier.quantify(X_area)[1]
                    results.append({
                        'experiment_id': experiment_id,
                        'area': area,
                        'method': q_name,
                        'true-prev': true_prev,
                        'estim-prev': pred_prev,
                        'AE': abs(pred_prev - true_prev)
                    })

            method_results = pd.DataFrame(results)
            method_results.to_csv(result_path, index=False)

        methods_results.append(method_results)

    methods_results = pd.concat(methods_results)
    # Zero-pad area identifiers to two characters.
    methods_results["area"] = methods_results["area"].astype(str).str.zfill(2)

    latex_table = LatexTable.from_dataframe(methods_results, method='method', benchmark='area', value='AE')
    latex_table.format.configuration.resizebox = True
    methods_order = [m for m, _ in methods()]
    if unipi_baseline_name is not None:
        # The baseline, when available, goes first.
        methods_order = [unipi_baseline_name] + methods_order
    latex_table.reorder_methods(methods_order)
    latex_table.latexPDF(pdf_path=join('./tables', f'{config_name}.pdf'), tabular_dir=f'tabular_{config_name}', landscape=False)

    # Mean absolute error per area (rows) and method (columns), with a
    # 'Mean' margin row/column.
    pv = methods_results.pivot_table(
        index='area',
        columns='method',
        values='AE',
        aggfunc='mean',
        margins=True,
        margins_name='Mean'
    )
    print(pv)