import os
from os.path import join
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from tqdm import tqdm

from quapy.data import LabelledCollection
from quapy.method.aggregative import PACC, PCC, EMQ, DMy, ACC, KDEyML, CC

from commons import configs
from src.new_table import LatexTable

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)


def load_data(data_path):
    """Load a simulation CSV and unpack features, labels, and area codes.

    The file name is expected to follow the pattern
    ``data_nF{F}_nA{A}_P{P}_nExp{E}.csv``; the number of features (nF) and
    the number of experiments (nExp) are parsed from it (nA and P are
    parsed but unused here).

    Parameters
    ----------
    data_path : str
        Path to the CSV file; its basename encodes nF and nExp.

    Returns
    -------
    X : np.ndarray
        Feature matrix of shape (n_samples, nF), from columns ``X_0..X_{nF-1}``.
    y : np.ndarray
        Labels from column ``Y``.
    areas : np.ndarray
        Area code per sample, from column ``area``.
    nExp : int
        Number of experiments encoded in the file name.
    df : pd.DataFrame
        The full dataframe (still needed for the ``InSample_{i}`` columns).
    """
    _, nF, nA, P, nExp = Path(data_path).name.replace('.csv', '').split('_')
    nF = int(nF.replace('nF', ''))
    nExp = int(nExp.replace('nExp', ''))
    df = pd.read_csv(data_path, index_col=0)
    # select the feature columns in order; equivalent to collecting each
    # column and transposing, but done in a single vectorized selection
    X = df[[f'X_{feat_id}' for feat_id in range(nF)]].values
    y = df.Y.values
    areas = df.area.values
    return X, y, areas, nExp, df


def methods():
    """Yield (name, quantifier) pairs for the quantification methods to run."""
    yield 'CC', CC(classifier=LogisticRegression())
    yield 'PCC', PCC(classifier=LogisticRegression())
    yield 'ACC', ACC(classifier=LogisticRegression())
    yield 'PACC', PACC(classifier=LogisticRegression())
    yield 'EMQ', EMQ(classifier=LogisticRegression())
    yield 'KDEy', KDEyML(classifier=LogisticRegression(), bandwidth=0.05)
    # yield 'KDEy01', KDEyML(classifier=LogisticRegression())


for config in configs:
    print(f'Running {config}')

    config_name = f'data_nF{config.n_features}_nA50_P50000_nExp100'
    data_path = f'./data/{config_name}.csv'
    result_dir = f'./results/{config_name}'
    os.makedirs(result_dir, exist_ok=True)

    X, y, A, numExperiments, df = load_data(data_path)
    areas = sorted(np.unique(A))
    n_areas = len(areas)

    methods_results = []

    # load baseline result from UniPI, if available
    baseline_path = join(result_dir, f'risultati_EB_nF{config.n_features}.csv')
    if os.path.exists(baseline_path):
        unipi_baseline_df = pd.read_csv(baseline_path, index_col=0, sep=';')
        unipi_baseline_df = unipi_baseline_df.rename(columns={'AE(SAE)': 'AE'})
        unipi_baseline_name = "SAE"
        methods_results.append(unipi_baseline_df)
    else:
        unipi_baseline_name = None

    # run quantification methods (per-method results are cached on disk)
    for q_name, quantifier in methods():
        result_path = join(result_dir, f'{q_name}.csv')
        if os.path.exists(result_path):
            method_results = pd.read_csv(result_path, index_col=0)
        else:
            results = []
            pbar = tqdm(range(numExperiments), total=numExperiments)
            for experiment_id in pbar:
                pbar.set_description(f'q_name={q_name}')

                # boolean mask of training samples for this experiment
                in_sample = df[f'InSample_{experiment_id}'].values.astype(dtype=bool)
                Xtr = X[in_sample]
                ytr = y[in_sample]
                Atr = A[in_sample]
                # NOTE(review): evaluation is on the *whole* population
                # (training samples included), not on the held-out part;
                # the commented lines below would switch to held-out testing
                # Xte = X[~in_sample]
                # yte = y[~in_sample]
                # Ate = A[~in_sample]
                Xte = X
                yte = y
                Ate = A

                train = LabelledCollection(Xtr, ytr, classes=[0, 1])
                quantifier.fit(train)

                # evaluate the fitted quantifier on each area separately
                for area in areas:
                    sel_te_a = Ate == area
                    test_A = LabelledCollection(Xte[sel_te_a], yte[sel_te_a], classes=[0, 1])
                    pred_prev = quantifier.quantify(test_A.X)[1]
                    true_prev = test_A.prevalence()[1]
                    ae = abs(pred_prev - true_prev)
                    results.append({
                        'experiment_id': experiment_id,
                        'area': area,
                        'method': q_name,
                        'true-prev': true_prev,
                        'estim-prev': pred_prev,
                        'AE': ae
                    })

            method_results = pd.DataFrame(results)
            method_results.to_csv(result_path, index=False)

        methods_results.append(method_results)

    methods_results = pd.concat(methods_results)
    # zero-pad area codes so they sort lexicographically (e.g., '02' < '10')
    methods_results["area"] = methods_results["area"].astype(str).str.zfill(2)

    latex_table = LatexTable.from_dataframe(
        methods_results, method='method', benchmark='area', value='AE', name=config_name
    )
    latex_table.format.configuration.resizebox = True
    latex_table.format.configuration.stat_alpha = 0.01
    methods_order = [m for m, _ in methods()]
    if unipi_baseline_name is not None:
        methods_order = [unipi_baseline_name] + methods_order
    latex_table.reorder_methods(methods_order)
    latex_table.latexPDF(pdf_path=join('./tables', f'{config_name}.pdf'), landscape=False)

    # mean AE per (area, method), with overall means appended
    pv = methods_results.pivot_table(
        index='area', columns='method', values='AE',
        aggfunc='mean', margins=True, margins_name='Mean'
    )
    print(pv)