"""Quantification experiments on the simulated census population data.

Reads ``./data/Simulated_PopulationData.csv``, trains a quantifier on the
in-sample records of each experiment replicate, and compares its per-area
prevalence estimates against two census baselines:

* soft baseline: mean of the censored posterior probabilities (``PrCens_*``)
* hard baseline: mean of the censored hard labels (``YCens_*``)

NOTE(review): evaluation currently runs on the FULL population (training
rows included) — the commented-out lines show the held-out variant; the
originating commit says this dataset is "not working", so confirm which
protocol is intended.
"""
import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

from quapy.data import LabelledCollection
from quapy.model_selection import GridSearchQ
from quapy.protocol import APP
from quapy.method.aggregative import PACC, PCC, EMQ, DMy, ACC, KDEyML, CC
import quapy.functional as F


def show_data(X, y=None, nbins=50):
    """Plot a histogram of the feature values, optionally split by class.

    :param X: 1-d array (or column vector) of feature values
    :param y: optional binary labels (0/1); when given, the positive and
        negative distributions are overlaid with shared bin edges
    :param nbins: number of histogram bins
    """
    import matplotlib.pyplot as plt  # local import: only needed for inspection
    if y is None:
        plt.hist(X, bins=nbins, edgecolor='black')
    else:
        pos = X[y == 1]
        neg = X[y == 0]
        # shared edges so the two class histograms are directly comparable
        bins = np.histogram_bin_edges(X, bins=nbins)
        plt.hist(pos, bins=bins, edgecolor='black', label='positive', alpha=0.5)
        plt.hist(neg, bins=bins, edgecolor='black', label='negative', alpha=0.5)
        plt.legend()  # fix: labels were passed but never displayed
    plt.xlabel('value')
    plt.ylabel('frequency')
    plt.show()


def main():
    """Run the per-area quantification experiments and print MAE summaries."""
    df = pd.read_csv('./data/Simulated_PopulationData.csv', index_col=0)

    X = df.X.values.reshape(-1, 1)
    y = df.Y.values
    A = df.area.values

    # X[y==1] += 2
    # show_data(X, y, nbins=50)

    areas = sorted(np.unique(A))

    N_EXPERIMENTS = 1

    for experiment_id in range(1, N_EXPERIMENTS + 1):
        in_sample = df[f'InSample_{experiment_id}'].values.astype(dtype=bool)

        Xtr = X[in_sample]
        ytr = y[in_sample]

        # held-out evaluation variant (currently disabled; see module docstring):
        # Xte, yte, Ate = X[~in_sample], y[~in_sample], A[~in_sample]
        # baseline_soft = df[f'PrCens_{experiment_id}'].values[~in_sample]
        # baseline_hard = df[f'YCens_{experiment_id}'].values[~in_sample]

        # evaluate on the full population (training rows included)
        Xte = X
        yte = y
        Ate = A
        baseline_soft = df[f'PrCens_{experiment_id}'].values
        baseline_hard = df[f'YCens_{experiment_id}'].values

        train = LabelledCollection(Xtr, ytr, classes=[0, 1])

        # quantifier under test; alternatives kept for quick experimentation
        # (fix: removed a dead `q = CC(...)` that was immediately overwritten)
        # q = CC(classifier=LogisticRegression())
        # q = PACC(classifier=LogisticRegression())
        # q = EMQ(classifier=LogisticRegression())
        # q = KDEyML(classifier=LogisticRegression(), bandwidth=0.001)
        q = PCC(classifier=LogisticRegression(C=1))
        # q = DMy(classifier=LogisticRegression(), nbins=16)
        q.fit(train)

        # model selection (disabled); fix: sklearn's option is 'balanced',
        # the original grid had the misspelling 'balance'
        # tr, val = train.split_stratified(random_state=0)
        # mod_sel = GridSearchQ(
        #     model=q,
        #     param_grid={
        #         'classifier__C': np.logspace(-3, 3, 7),
        #         'classifier__class_weight': ['balanced', None],
        #         'bandwidth': np.linspace(0.02, 0.20, 19)
        #     },
        #     protocol=APP(data=val, sample_size=100, n_prevalences=21, repeats=10, random_state=0),
        #     refit=True,
        #     n_jobs=-1
        # ).fit(tr)
        # q = mod_sel.best_model_

        mae = []
        mae_baseline_soft = []
        mae_baseline_hard = []

        for area in areas:
            sel_te_a = Ate == area
            test_A = LabelledCollection(Xte[sel_te_a], yte[sel_te_a], classes=[0, 1])

            # predicted vs. true prevalence of the positive class in this area
            pred_prev = q.quantify(test_A.X)[1]
            true_prev = test_A.prevalence()[1]
            ae = abs(pred_prev - true_prev)
            mae.append(ae)

            baseline_soft_estim = np.mean(baseline_soft[sel_te_a])
            ae_baseline_soft = abs(baseline_soft_estim - true_prev)
            mae_baseline_soft.append(ae_baseline_soft)

            baseline_hard_estim = np.mean(baseline_hard[sel_te_a])
            ae_baseline_hard = abs(baseline_hard_estim - true_prev)
            mae_baseline_hard.append(ae_baseline_hard)

            print(f'Area {area} true={true_prev:.2f} '
                  f'baseline-soft={baseline_soft_estim:.3f} (AE={ae_baseline_soft:.3f}) '
                  f'baseline-hard={baseline_hard_estim:.3f} (AE={ae_baseline_hard:.3f}) '
                  f'predicted={pred_prev:.3f} (AE={ae:.3f})')

        print(f'Experiment {experiment_id} '
              f'Baseline(soft)={np.mean(mae_baseline_soft):.3f} '
              f'Baseline(hard)={np.mean(mae_baseline_hard):.3f} '
              f'MAE={np.mean(mae):.3f}')


if __name__ == '__main__':
    main()