"""Benchmark quantification methods on per-area prevalence estimation.

For every quantifier yielded by :func:`methods` and every experiment recorded
in the data file, the quantifier is trained on the rows flagged by the
``InSample_<experiment_id>`` column and then asked to estimate the prevalence
of class 1 in each area. The absolute error with respect to the true
prevalence is stored per (experiment, area, method), cached as one CSV per
method under ``./results/<config>/``, and finally summarised as a table of
mean absolute errors (areas as rows, methods as columns).
"""

import os
from os.path import join
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from tqdm import tqdm

import quapy.functional as F
from quapy.data import LabelledCollection
from quapy.method.aggregative import PACC, PCC, EMQ, DMy, ACC, KDEyML, CC
from quapy.model_selection import GridSearchQ
from quapy.protocol import APP

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)


def load_data(data_path):
    """Load one experiment data file.

    The file name (without ``.csv``) must consist of exactly five
    ``_``-separated tokens, the second being ``nF<int>`` (number of feature
    columns) and the fifth ``nExp<int>`` (number of experiments), e.g.
    ``data_nF10_nA50_P50000_nExp100.csv``.

    :param data_path: path to the CSV file; its first column is used as the
        index and it must contain the columns ``X_0`` ... ``X_<nF-1>``, ``Y``
        and ``area``.
    :return: tuple ``(X, y, areas, nExp, df)`` where ``X`` is the feature
        matrix with one column per ``X_<i>``, ``y`` the values of column
        ``Y``, ``areas`` the values of column ``area``, ``nExp`` the number
        of experiments parsed from the file name, and ``df`` the full frame.
    :raises ValueError: if the file name does not have five tokens or the
        numeric parts cannot be parsed as integers.
    """
    # Only the feature count and the experiment count are needed; the
    # remaining tokens are unpacked so that a malformed name fails loudly.
    _, nF, _nA, _P, nExp = Path(data_path).name.replace('.csv', '').split('_')
    nF = int(nF.replace('nF', ''))
    nExp = int(nExp.replace('nExp', ''))

    df = pd.read_csv(data_path, index_col=0)

    # Stack the feature columns row-wise, then transpose to (rows, features).
    X = np.asarray([df[f'X_{feat_id}'].values for feat_id in range(nF)]).T
    y = df.Y.values
    areas = df.area.values

    return X, y, areas, nExp, df


def methods():
    """Yield ``(name, quantifier)`` pairs for every method under comparison.

    All quantifiers wrap a fresh default ``LogisticRegression``.
    """
    yield 'CC', CC(classifier=LogisticRegression())
    yield 'PCC', PCC(classifier=LogisticRegression())
    yield 'ACC', ACC(classifier=LogisticRegression())
    yield 'PACC', PACC(classifier=LogisticRegression())
    yield 'EMQ', EMQ(classifier=LogisticRegression())
    yield 'KDEy', KDEyML(classifier=LogisticRegression(), bandwidth=0.05)
    # Library-default bandwidth (presumably 0.1, given the name -- confirm).
    yield 'KDEy01', KDEyML(classifier=LogisticRegression())


def _evaluate_method(q_name, quantifier, X, y, A, df, areas, num_experiments):
    """Run every experiment for one quantifier and return its results frame.

    :param q_name: method name, stored in the ``method`` column.
    :param quantifier: quantifier exposing ``fit`` and ``quantify``.
    :param X: feature matrix of all rows.
    :param y: labels of all rows.
    :param A: area identifier of all rows.
    :param df: full data frame holding the ``InSample_<id>`` columns.
    :param areas: area identifiers to evaluate.
    :param num_experiments: number of ``InSample_<id>`` columns to iterate.
    :return: DataFrame with columns ``experiment_id``, ``area``, ``method``,
        ``true-prev``, ``estim-prev`` and ``AE``.
    """
    # The per-area row selection does not depend on the experiment.
    # NOTE: each area is evaluated on ALL of its rows, including the ones
    # used for training; an out-of-sample evaluation would additionally
    # restrict these masks with ``~in_sample``.
    area_masks = [(area, A == area) for area in areas]

    results = []
    pbar = tqdm(range(num_experiments), total=num_experiments,
                desc=f'q_name={q_name}')
    for experiment_id in pbar:
        in_sample = df[f'InSample_{experiment_id}'].values.astype(dtype=bool)

        train = LabelledCollection(X[in_sample], y[in_sample], classes=[0, 1])
        quantifier.fit(train)

        for area, in_area in area_masks:
            test_A = LabelledCollection(X[in_area], y[in_area], classes=[0, 1])
            # Index 1 selects the prevalence of class 1.
            pred_prev = quantifier.quantify(test_A.X)[1]
            true_prev = test_A.prevalence()[1]
            results.append({
                'experiment_id': experiment_id,
                'area': area,
                'method': q_name,
                'true-prev': true_prev,
                'estim-prev': pred_prev,
                'AE': abs(pred_prev - true_prev),
            })

    return pd.DataFrame(results)


data_path = './data/data_nF10_nA50_P50000_nExp100.csv'

config = Path(data_path).name.replace('.csv', '')
result_dir = f'./results/{config}'
os.makedirs(result_dir, exist_ok=True)

X, y, A, numExperiments, df = load_data(data_path)
areas = sorted(np.unique(A))
n_areas = len(areas)

methods_results = []

for q_name, quantifier in methods():
    result_path = join(result_dir, f'{q_name}.csv')
    if os.path.exists(result_path):
        # The cache is written without an index column, so it must be read
        # back without one; otherwise 'experiment_id' would be consumed as
        # the index and cached frames would differ from fresh ones.
        method_results = pd.read_csv(result_path)
    else:
        method_results = _evaluate_method(
            q_name, quantifier, X, y, A, df, areas, numExperiments
        )
        method_results.to_csv(result_path, index=False)

    methods_results.append(method_results)

methods_results = pd.concat(methods_results)

# Mean absolute error per area (rows) and method (columns); the 'Mean'
# margins hold the averages over all areas / all methods.
pv = methods_results.pivot_table(
    index='area',
    columns='method',
    values='AE',
    aggfunc='mean',
    margins=True,
    margins_name='Mean'
)
print(pv)