From f063e4f5dc73c97d90d5974225a89dac2680f872 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Sun, 15 Jun 2025 11:57:49 +0200 Subject: [PATCH] generating data with make classification --- Census/simulated-cnr/gen_data.py | 91 +++++++++++++++++++++ Census/simulated-cnr/main.py | 124 ++++++++++++++++++++++++++++ Census/simulated-unipi/main.py | 135 +++++++++++++++++++++++++++++++ 3 files changed, 350 insertions(+) create mode 100644 Census/simulated-cnr/gen_data.py create mode 100644 Census/simulated-cnr/main.py create mode 100644 Census/simulated-unipi/main.py diff --git a/Census/simulated-cnr/gen_data.py b/Census/simulated-cnr/gen_data.py new file mode 100644 index 0000000..45c7222 --- /dev/null +++ b/Census/simulated-cnr/gen_data.py @@ -0,0 +1,91 @@ +import os +from pathlib import Path +from sklearn.datasets import make_classification +import numpy as np +from quapy.data import LabelledCollection +from quapy.protocol import UniformPrevalenceProtocol +import quapy.functional as F +import pandas as pd + +random_state = 0 + +n_features = 10 +n_areas = 50 +n_per_area = 1_000 +population_size = n_areas * n_per_area +n_experiments = 100 +n_survey = population_size//n_experiments + +print(f'{n_features=}') +print(f'{n_areas=}') +print(f'{n_per_area=}') +print(f'{population_size=}') +print(f'{n_experiments=}') +print(f'{n_survey=}') + +X, y = make_classification( + n_samples=population_size * 100, + n_features=n_features, + n_informative=n_features//2, + n_redundant=2, + n_repeated=0, + n_classes=2, + n_clusters_per_class=2, + weights=[0.5, 0.5], + flip_y=0.01, + class_sep=1.0, + hypercube=True, + shift=0.0, + scale=1.0, + shuffle=True, + random_state=random_state) + +pool = LabelledCollection(X, y, classes=[0,1]) +upp = UniformPrevalenceProtocol(pool, sample_size=n_per_area, repeats=n_areas, random_state=random_state, return_type='labelled_collection') + +data_X = [] +data_y = [] +data_area = [] +experiment_selections = [] + +for area_id, area_sample in enumerate(upp()): + print(f'{area_id=} has prevalence={F.strprev(area_sample.prevalence())}') + data_X.append(area_sample.X) + data_y.append(area_sample.y) + data_area.append([area_id]*n_per_area) + +data_X = np.concatenate(data_X) +data_y = np.concatenate(data_y) +data_area = np.concatenate(data_area) + +assert len(data_area) == population_size, 'unexpected size!' + +idx = np.arange(population_size) +rand_order = np.random.permutation(population_size) +for experiment_id, offset_id in enumerate(range(0,population_size,n_survey)): + experiment_sel = rand_order[offset_id:offset_id+n_survey] + in_sample_id = np.zeros_like(data_area) + in_sample_id[experiment_sel] = 1 + experiment_selections.append(in_sample_id) + +# compose the dataframe +data_dic = { + 'ID': idx, + 'Y': data_y, +} +for feat_id in range(n_features): + data_dic[f'X_{feat_id}'] = data_X[:,feat_id] +data_dic['area'] = data_area + +for experiment_id, experiment_selection in enumerate(experiment_selections): + data_dic[f'InSample_{experiment_id}'] = experiment_selection + +df = pd.DataFrame(data_dic) + +data_path = f'./data/data_nF{n_features}_nA{n_areas}_P{population_size}_nExp{n_experiments}.csv' +os.makedirs(Path(data_path).parent, exist_ok=True) +df.to_csv(data_path, index=0) + + + + diff --git a/Census/simulated-cnr/main.py b/Census/simulated-cnr/main.py new file mode 100644 index 0000000..b76fe9a --- /dev/null +++ b/Census/simulated-cnr/main.py @@ -0,0 +1,124 @@ +import os +from os.path import join + +import numpy as np +import pandas as pd +from sklearn.calibration import CalibratedClassifierCV +from sklearn.linear_model import LogisticRegression, LogisticRegressionCV +from pathlib import Path +from quapy.data import LabelledCollection +from quapy.model_selection import GridSearchQ +from quapy.protocol import APP +from quapy.method.aggregative import PACC, PCC, EMQ, DMy, ACC, KDEyML, CC +import quapy.functional as F +from tqdm import tqdm + +pd.set_option('display.max_columns', None) +pd.set_option('display.width', 1000) + + +def load_data(data_path): + _, nF, nA, P, nExp = Path(data_path).name.replace('.csv','').split('_') + nF = int(nF.replace('nF', '')) + nExp = int(nExp.replace('nExp', '')) + + df = pd.read_csv(data_path, index_col = 0) + + X_T = [] + for feat_id in range(nF): + Xcol = df[f'X_{feat_id}'].values + X_T.append(Xcol) + X = np.asarray(X_T).T + + y = df.Y.values + areas = df.area.values + + return X, y, areas, nExp, df + +def methods(): + yield 'CC', CC(classifier=LogisticRegression()) + yield 'PCC', PCC(classifier=LogisticRegression()) + yield 'ACC', ACC(classifier=LogisticRegression()) + yield 'PACC', PACC(classifier=LogisticRegression()) + yield 'EMQ', EMQ(classifier=LogisticRegression()) + yield 'KDEy', KDEyML(classifier=LogisticRegression(), bandwidth=0.05) + yield 'KDEy01', KDEyML(classifier=LogisticRegression()) + + +data_path = './data/data_nF10_nA50_P50000_nExp100.csv' + +config = Path(data_path).name.replace('.csv','') +result_dir = f'./results/{config}' +os.makedirs(result_dir, exist_ok=True) + +X, y, A, numExperiments, df = load_data(data_path) + +areas = sorted(np.unique(A)) +n_areas = len(areas) + +methods_results = [] + +for q_name, quantifier in methods(): + + result_path = join(result_dir, f'{q_name}.csv') + if os.path.exists(result_path): + method_results = pd.read_csv(result_path, index_col=0) + else: + results = [] + pbar = tqdm(range(numExperiments), total=numExperiments) + for experiment_id in pbar: + pbar.set_description(f'q_name={q_name}') + in_sample = df[f'InSample_{experiment_id}'].values.astype(dtype=bool) + + Xtr = X[in_sample] + ytr = y[in_sample] + Atr = A[in_sample] + + # Xte = X[~in_sample] + # yte = y[~in_sample] + # Ate = A[~in_sample] + + Xte = X + yte = y + Ate = A + + train = LabelledCollection(Xtr, ytr, classes=[0, 1]) + quantifier.fit(train) + + for area in areas: + sel_te_a = Ate == area + test_A = LabelledCollection(Xte[sel_te_a], yte[sel_te_a], classes=[0,1]) + + pred_prev = quantifier.quantify(test_A.X)[1] + true_prev = test_A.prevalence()[1] + ae = abs(pred_prev-true_prev) + + results.append({ + 'experiment_id': experiment_id, + 'area': area, + 'method': q_name, + 'true-prev': true_prev, + 'estim-prev': pred_prev, + 'AE': ae + }) + + method_results = pd.DataFrame(results) + method_results.to_csv(result_path, index=0) + methods_results.append(method_results) + +methods_results = pd.concat(methods_results) +pv = methods_results.pivot_table( + index='area', + columns='method', + values='AE', + aggfunc='mean', + margins=True, + margins_name='Mean' +) +print(pv) + + + + + + diff --git a/Census/simulated-unipi/main.py b/Census/simulated-unipi/main.py new file mode 100644 index 0000000..111da0b --- /dev/null +++ b/Census/simulated-unipi/main.py @@ -0,0 +1,135 @@ +import numpy as np +import pandas as pd +from sklearn.calibration import CalibratedClassifierCV +from sklearn.linear_model import LogisticRegression, LogisticRegressionCV + +from quapy.data import LabelledCollection +from quapy.model_selection import GridSearchQ +from quapy.protocol import APP +from quapy.method.aggregative import PACC, PCC, EMQ, DMy, ACC, KDEyML, CC +import quapy.functional as F + +def show_data(X, y=None, nbins=50): + import matplotlib.pyplot as plt + if y is None: + plt.hist(X, bins=nbins, edgecolor='black') + else: + pos = X[y==1] + neg = X[y==0] + bins = np.histogram_bin_edges(X, bins=nbins) + plt.hist(pos, bins=bins, edgecolor='black', label='positive', alpha=0.5) + plt.hist(neg, bins=bins, edgecolor='black', label='negative', alpha=0.5) + plt.xlabel('value') + plt.ylabel('frequency') + plt.show() + +df = pd.read_csv('./data/Simulated_PopulationData.csv', index_col=0) + +X = df.X.values.reshape(-1,1) +y = df.Y.values +A = df.area.values + +# X[y==1] += 2 + +show_data(X, y, nbins=50) +show_data(X, nbins=50) + +areas = sorted(np.unique(A)) +n_areas = len(areas) + +N_EXPERIMENTS=2 + +# print(list(df.columns)) + +for experiment_id in range(1, N_EXPERIMENTS+1): + in_sample = df[f'InSample_{experiment_id}'].values.astype(dtype=bool) + + Xtr = X[in_sample] + ytr = y[in_sample] + Atr = A[in_sample] + + show_data(Xtr, ytr) + show_data(Xtr) + + # Xte = X[~in_sample] + # yte = y[~in_sample] + # Ate = A[~in_sample] + # baseline_soft = df[f'PrCens_{experiment_id}'].values[~in_sample] + # baseline_hard = df[f'YCens_{experiment_id}'].values[~in_sample] + + Xte = X + yte = y + Ate = A + baseline_soft = df[f'PrCens_{experiment_id}'].values + baseline_hard = df[f'YCens_{experiment_id}'].values + + train = LabelledCollection(Xtr, ytr, classes=[0, 1]) + + # print(f'Experiment {experiment_id}: training prevalence = {train.prevalence()[1]:.3f}') + + q = CC(classifier=LogisticRegression()) + # q = PACC(classifier=LogisticRegression()) + # q = EMQ(classifier=LogisticRegression()) + # q = KDEyML(classifier=LogisticRegression(), bandwidth=0.001) + q = PCC(classifier=LogisticRegression(C=1)) + # q = DMy(classifier=LogisticRegression(), nbins=16) + q.fit(train) + + # tr, val = train.split_stratified(random_state=0) + # mod_sel = GridSearchQ( + # model=q, + # param_grid={ + # 'classifier__C':np.logspace(-3,3,7), + # 'classifier__class_weight':['balance', None], + # 'bandwidth': np.linspace(0.02, 0.20, 19) + # }, + # protocol=APP(data=val, sample_size=100, n_prevalences=21, repeats=10, random_state=0), + # refit=True, + # n_jobs=-1 + # ).fit(tr) + # q = mod_sel.best_model_ + + mae = [] + mae_baseline_soft = [] + mae_baseline_hard = [] + + for area in areas: + + # sel_tr_a = Atr == area + sel_te_a = Ate == area + + # train_A = LabelledCollection(Xtr[sel_tr_a], ytr[sel_tr_a], classes=[0,1]) + test_A = LabelledCollection(Xte[sel_te_a], yte[sel_te_a], classes=[0,1]) + + # if np.prod(train_A.prevalence())==0: continue + + # print(f'train-prev A = {train_A.prevalence()} n_instances={len(train_A)}') + + # q = DMy(classifier=LogisticRegression()) + # q.fit(train_A) + + pred_prev = q.quantify(test_A.X)[1] + true_prev = test_A.prevalence()[1] + ae = abs(pred_prev-true_prev) + mae.append(ae) + + baseline_soft_estim = np.mean(baseline_soft[sel_te_a]) + ae_baseline_soft = abs(baseline_soft_estim-true_prev) + mae_baseline_soft.append(ae_baseline_soft) + + baseline_hard_estim = np.mean(baseline_hard[sel_te_a]) + ae_baseline_hard = abs(baseline_hard_estim - true_prev) + mae_baseline_hard.append(ae_baseline_hard) + + print(f'Area {area} true={true_prev:.2f} ' + f'baseline-soft={baseline_soft_estim:.3f} (AE={ae_baseline_soft:.3f}) ' + f'baseline-hard={baseline_hard_estim:.3f} (AE={ae_baseline_hard:.3f}) ' + f'predicted={pred_prev:.3f} (AE={ae:.3f})') + + mae = np.mean(mae) + mae_baseline_soft = np.mean(mae_baseline_soft) + mae_baseline_hard = np.mean(mae_baseline_hard) + print(f'Experiment {experiment_id} Baseline(soft)={mae_baseline_soft:.3f} Baseline(hard)={mae_baseline_hard:.3f} MAE={mae:.3f}') + + +