150 lines
4.6 KiB
Python
150 lines
4.6 KiB
Python
import os
|
|
from os.path import join
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
|
|
from pathlib import Path
|
|
from quapy.data import LabelledCollection
|
|
from quapy.method.aggregative import PACC, PCC, EMQ, DMy, ACC, KDEyML, CC
|
|
from tqdm import tqdm
|
|
from commons import configs
|
|
|
|
from src.new_table import LatexTable
|
|
|
|
# Console display settings for printed DataFrames: no cap on the number of
# columns shown, and a display width of 1000 characters.
pd.options.display.max_columns = None
pd.options.display.width = 1000
|
|
|
|
|
|
def load_data(data_path):
    """Load one dataset CSV and split it into arrays.

    The file name (without ``.csv``) must consist of exactly five
    underscore-separated tokens; the second one carries the number of
    features (``nF<int>``) and the fifth one the number of experiments
    (``nExp<int>``). A name with a different number of tokens raises
    ``ValueError`` before the file is read.

    Parameters
    ----------
    data_path : path to the CSV file; its first column is used as the index.

    Returns
    -------
    tuple ``(X, y, areas, nExp, df)`` where ``X`` stacks the columns
    ``X_0 .. X_{nF-1}`` side by side, ``y`` holds column ``Y``, ``areas``
    holds column ``area``, ``nExp`` is the experiment count parsed from the
    file name, and ``df`` is the full DataFrame.
    """
    stem = Path(data_path).name.replace('.csv', '')
    # Exactly five tokens are expected; only the 2nd and 5th are used.
    _prefix, features_token, _areas_token, _pop_token, experiments_token = stem.split('_')
    n_features = int(features_token.replace('nF', ''))
    n_experiments = int(experiments_token.replace('nExp', ''))

    frame = pd.read_csv(data_path, index_col=0)

    # Collect feature columns row-wise, then transpose so that each row of
    # the result corresponds to one row of the CSV.
    feature_rows = [frame[f'X_{idx}'].values for idx in range(n_features)]
    features = np.asarray(feature_rows).T

    labels = frame.Y.values
    area_ids = frame.area.values

    return features, labels, area_ids, n_experiments, frame
|
|
|
|
def methods():
    """Yield ``(name, quantifier)`` pairs for the methods under evaluation.

    Every quantifier is built lazily, right before it is yielded, around a
    fresh ``LogisticRegression`` with default settings. The order of the
    yielded names is CC, PCC, ACC, PACC, EMQ, KDEy.
    """
    # Quantifiers that only take the classifier as argument.
    classifier_only = (
        ('CC', CC),
        ('PCC', PCC),
        ('ACC', ACC),
        ('PACC', PACC),
        ('EMQ', EMQ),
    )
    for label, quantifier_cls in classifier_only:
        yield label, quantifier_cls(classifier=LogisticRegression())

    # KDEy additionally fixes the kernel bandwidth.
    yield 'KDEy', KDEyML(classifier=LogisticRegression(), bandwidth=0.05)
    # yield 'KDEy01', KDEyML(classifier=LogisticRegression())
|
|
|
|
|
|
# Main experiment loop. For every configuration: load the dataset, obtain the
# per-area absolute errors of every quantification method (from a cached CSV
# when available, otherwise by running all experiments), then render the
# results as a LaTeX/PDF table and print a pivot table of mean errors.
for config in configs:

    print(f'Running {config}')

    config_name = f'data_nF{config.n_features}_nA50_P50000_nExp100'
    data_path = f'./data/{config_name}.csv'

    result_dir = f'./results/{config_name}'
    os.makedirs(result_dir, exist_ok=True)

    X, y, A, numExperiments, df = load_data(data_path)

    areas = sorted(np.unique(A))
    n_areas = len(areas)

    methods_results = []

    # load baseline result from UniPI
    baseline_path = join(result_dir, 'Risultati_SAE.csv')
    if os.path.exists(baseline_path):
        unipi_baseline_df = pd.read_csv(baseline_path, index_col=0, sep=';')
        # Align the baseline's error column name with the one used below.
        unipi_baseline_df = unipi_baseline_df.rename(columns={'AE(SAE)': 'AE'})
        unipi_baseline_name = "SAE"
        methods_results.append(unipi_baseline_df)
    else:
        unipi_baseline_name = None

    # run quantification methods
    for q_name, quantifier in methods():
        result_path = join(result_dir, f'{q_name}.csv')
        if os.path.exists(result_path):
            # The cache is written without an index column (see to_csv
            # below), so it must be read back without index_col; otherwise
            # 'experiment_id' would be consumed as the index and the cached
            # frame would differ from a freshly computed one.
            method_results = pd.read_csv(result_path)
        else:
            results = []
            pbar = tqdm(
                range(numExperiments),
                total=numExperiments,
                desc=f'q_name={q_name}',
            )
            for experiment_id in pbar:
                # Boolean mask of the rows sampled for training in this experiment.
                in_sample = df[f'InSample_{experiment_id}'].values.astype(dtype=bool)

                Xtr = X[in_sample]
                ytr = y[in_sample]
                Atr = A[in_sample]

                # NOTE: evaluation uses every row, including the training
                # ones. The out-of-sample alternative would be
                # X[~in_sample], y[~in_sample], A[~in_sample].
                Xte = X
                yte = y
                Ate = A

                train = LabelledCollection(Xtr, ytr, classes=[0, 1])
                quantifier.fit(train)

                for area in areas:
                    sel_te_a = Ate == area
                    test_A = LabelledCollection(Xte[sel_te_a], yte[sel_te_a], classes=[0, 1])

                    # Index 1 selects the prevalence of class 1.
                    pred_prev = quantifier.quantify(test_A.X)[1]
                    true_prev = test_A.prevalence()[1]
                    ae = abs(pred_prev - true_prev)

                    results.append({
                        'experiment_id': experiment_id,
                        'area': area,
                        'method': q_name,
                        'true-prev': true_prev,
                        'estim-prev': pred_prev,
                        'AE': ae,
                    })

            method_results = pd.DataFrame(results)
            method_results.to_csv(result_path, index=False)
        methods_results.append(method_results)

    methods_results = pd.concat(methods_results)

    # Render area ids as zero-padded strings of width two (e.g. 1 -> '01').
    methods_results["area"] = methods_results["area"].astype(str).str.zfill(2)
    latex_table = LatexTable.from_dataframe(methods_results, method='method', benchmark='area', value='AE')
    latex_table.format.configuration.resizebox = True

    # Column order of the table: baseline first (when present), then the
    # methods in the order in which methods() yields them.
    methods_order = [m for m, _ in methods()]
    if unipi_baseline_name is not None:
        methods_order = [unipi_baseline_name] + methods_order

    latex_table.reorder_methods(methods_order)
    latex_table.latexPDF(pdf_path=join('./tables', f'{config_name}.pdf'), tabular_dir=f'tabular_{config_name}', landscape=False)

    # Mean absolute error per (area, method), with an extra 'Mean' margin
    # row and column.
    pv = methods_results.pivot_table(
        index='area',
        columns='method',
        values='AE',
        aggfunc='mean',
        margins=True,
        margins_name='Mean',
    )
    print(pv)
|
|
|
|
|
|
|
|
|
|
|
|
|