diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..26000e8
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "result_path"]
+	path = result_path
+	url = gitea@gitea-s2i2s.isti.cnr.it:moreo/result_table.git
diff --git a/Census/simulated-cnr/commons.py b/Census/simulated-cnr/commons.py
new file mode 100644
index 0000000..47cb880
--- /dev/null
+++ b/Census/simulated-cnr/commons.py
@@ -0,0 +1,22 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class DataConfig:
+    n_features: int
+    n_informative: int
+    n_redundant: int
+    n_clusters_per_class: int
+    flip_y: float
+
+config_easy = DataConfig(n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, flip_y=0.0)
+config_medium = DataConfig(n_features=10, n_informative=5, n_redundant=2, n_clusters_per_class=2, flip_y=0.01)
+config_hard = DataConfig(n_features=50, n_informative=20, n_redundant=10, n_clusters_per_class=4, flip_y=0.05)
+config_veryhard = DataConfig(n_features=100, n_informative=30, n_redundant=20, n_clusters_per_class=8, flip_y=0.05)
+
+configs = [
+    config_easy,
+    config_medium,
+    config_hard,
+    config_veryhard
+]
\ No newline at end of file
diff --git a/Census/simulated-cnr/gen_data.py b/Census/simulated-cnr/gen_data.py
index 45c7222..ef715d7 100644
--- a/Census/simulated-cnr/gen_data.py
+++ b/Census/simulated-cnr/gen_data.py
@@ -6,85 +6,88 @@ from quapy.data import LabelledCollection
 from quapy.protocol import UniformPrevalenceProtocol
 import quapy.functional as F
 import pandas as pd
+from commons import configs
 
 random_state = 0
-n_features = 10
+
 n_areas = 50
 n_per_area = 1_000
 population_size = n_areas * n_per_area
 n_experiments = 100
 n_survey = population_size//n_experiments
 
-print(f'{n_features=}')
-print(f'{n_areas=}')
-print(f'{n_per_area=}')
-print(f'{population_size=}')
-print(f'{n_experiments=}')
-print(f'{n_survey=}')
-
-X, y = make_classification(
-    n_samples=population_size * 100,
-    n_features=n_features,
-    n_informative=n_features//2,
-    n_redundant=2,
-    n_repeated=0,
-    n_classes=2,
-    n_clusters_per_class=2,
-    weights=[0.5, 0.5],
-    flip_y=0.01,
-    class_sep=1.0,
-    hypercube=True,
-    shift=0.0,
-    scale=1.0,
-    shuffle=True,
-    random_state=random_state)
+for config in configs:
+    print(f'{config.n_features=}')
+    print(f'{n_areas=}')
+    print(f'{n_per_area=}')
+    print(f'{population_size=}')
+    print(f'{n_experiments=}')
+    print(f'{n_survey=}')
 
-pool = LabelledCollection(X, y, classes=[0,1])
-upp = UniformPrevalenceProtocol(pool, sample_size=n_per_area, repeats=n_areas, random_state=random_state, return_type='labelled_collection')
+    X, y = make_classification(
+        n_samples=population_size * 100,
+        n_features=config.n_features,
+        n_informative=config.n_informative,
+        n_redundant=config.n_redundant,
+        n_repeated=0,
+        n_classes=2,
+        n_clusters_per_class=config.n_clusters_per_class,
+        weights=[0.5, 0.5],
+        flip_y=config.flip_y,
+        class_sep=1.0,
+        hypercube=True,
+        shift=0.0,
+        scale=1.0,
+        shuffle=True,
+        random_state=random_state)
 
-data_X = []
-data_y = []
-data_area = []
-experiment_selections = []
+    pool = LabelledCollection(X, y, classes=[0,1])
+    upp = UniformPrevalenceProtocol(pool, sample_size=n_per_area, repeats=n_areas, random_state=random_state, return_type='labelled_collection')
 
-for area_id, area_sample in enumerate(upp()):
-    print(f'{area_id=} has prevalence={F.strprev(area_sample.prevalence())}')
-    data_X.append(area_sample.X)
-    data_y.append(area_sample.y)
-    data_area.append([area_id]*n_per_area)
+    data_X = []
+    data_y = []
+    data_area = []
+    experiment_selections = []
 
-data_X = np.concatenate(data_X)
-data_y = np.concatenate(data_y)
-data_area = np.concatenate(data_area)
+    for area_id, area_sample in enumerate(upp()):
+        print(f'{area_id=} has prevalence={F.strprev(area_sample.prevalence())}')
+        data_X.append(area_sample.X)
+        data_y.append(area_sample.y)
+        data_area.append([area_id]*n_per_area)
 
-assert len(data_area) == population_size, 'unexpected size!'
+    data_X = np.concatenate(data_X)
+    data_y = np.concatenate(data_y)
+    data_area = np.concatenate(data_area)
 
-idx = np.arange(population_size)
-rand_order = np.random.permutation(population_size)
-for experiment_id, offset_id in enumerate(range(0,population_size,n_survey)):
-    experiment_sel = rand_order[offset_id:offset_id+n_survey]
-    in_sample_id = np.zeros_like(data_area)
-    in_sample_id[experiment_sel] = 1
-    experiment_selections.append(in_sample_id)
+    assert len(data_area) == population_size, 'unexpected size!'
 
-# compose the dataframe
-data_dic = {
-    'ID': idx,
-    'Y': data_y,
-}
-for feat_id in range(n_features):
-    data_dic[f'X_{feat_id}'] = data_X[:,feat_id]
-data_dic['area'] = data_area
+    idx = np.arange(population_size)
+    rand_order = np.random.permutation(population_size)
+    for experiment_id, offset_id in enumerate(range(0,population_size,n_survey)):
+        experiment_sel = rand_order[offset_id:offset_id+n_survey]
+        in_sample_id = np.zeros_like(data_area)
+        in_sample_id[experiment_sel] = 1
+        experiment_selections.append(in_sample_id)
 
-for experiment_id, experiment_selection in enumerate(experiment_selections):
-    data_dic[f'InSample_{experiment_id}'] = experiment_selection
+    # compose the dataframe
+    data_dic = {
+        'ID': idx,
+        'Y': data_y,
+    }
+    for feat_id in range(config.n_features):
+        data_dic[f'X_{feat_id}'] = data_X[:,feat_id]
+    data_dic['area'] = data_area
 
-df = pd.DataFrame(data_dic)
+    for experiment_id, experiment_selection in enumerate(experiment_selections):
+        data_dic[f'InSample_{experiment_id}'] = experiment_selection
 
-data_path = f'./data/data_nF{n_features}_nA{n_areas}_P{population_size}_nExp{n_experiments}.csv'
-os.makedirs(Path(data_path).parent, exist_ok=True)
-df.to_csv(data_path, index=0)
+    df = pd.DataFrame(data_dic)
+
+    data_path = f'./data/data_nF{config.n_features}_nA{n_areas}_P{population_size}_nExp{n_experiments}.csv'
+    os.makedirs(Path(data_path).parent, exist_ok=True)
+    df.to_csv(data_path, index=0)
diff --git a/Census/simulated-cnr/main.py b/Census/simulated-cnr/main.py
index b76fe9a..4610d26 100644
--- a/Census/simulated-cnr/main.py
+++ b/Census/simulated-cnr/main.py
@@ -3,15 +3,14 @@ from os.path import join
 
 import numpy as np
 import pandas as pd
-from sklearn.calibration import CalibratedClassifierCV
 from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from pathlib import Path
 from quapy.data import LabelledCollection
-from quapy.model_selection import GridSearchQ
-from quapy.protocol import APP
 from quapy.method.aggregative import PACC, PCC, EMQ, DMy, ACC, KDEyML, CC
-import quapy.functional as F
 from tqdm import tqdm
+from commons import configs
+
+from src.new_table import LatexTable
 
 pd.set_option('display.max_columns', None)
 pd.set_option('display.width', 1000)
@@ -42,80 +41,106 @@ def methods():
     yield 'PACC', PACC(classifier=LogisticRegression())
     yield 'EMQ', EMQ(classifier=LogisticRegression())
     yield 'KDEy', KDEyML(classifier=LogisticRegression(), bandwidth=0.05)
-    yield 'KDEy01', KDEyML(classifier=LogisticRegression())
+    # yield 'KDEy01', KDEyML(classifier=LogisticRegression())
 
 
-data_path = './data/data_nF10_nA50_P50000_nExp100.csv'
+for config in configs:
 
-config = Path(data_path).name.replace('.csv','')
-result_dir = f'./results/{config}'
-os.makedirs(result_dir, exist_ok=True)
+    print(f'Running {config}')
 
-X, y, A, numExperiments, df = load_data(data_path)
+    config_name = f'data_nF{config.n_features}_nA50_P50000_nExp100'
+    data_path = f'./data/{config_name}.csv'
 
-areas = sorted(np.unique(A))
-n_areas = len(areas)
+    result_dir = f'./results/{config_name}'
+    os.makedirs(result_dir, exist_ok=True)
 
-methods_results = []
+    X, y, A, numExperiments, df = load_data(data_path)
 
-for q_name, quantifier in methods():
+    areas = sorted(np.unique(A))
+    n_areas = len(areas)
 
-    result_path = join(result_dir, f'{q_name}.csv')
-    if os.path.exists(result_path):
-        method_results = pd.read_csv(result_path, index_col=0)
+    methods_results = []
+
+    # load baseline result from UniPI
+    baseline_path = join(result_dir, 'Risultati_SAE.csv')
+    if os.path.exists(baseline_path):
+        unipi_baseline_df = pd.read_csv(baseline_path, index_col=0, sep=';')
+        unipi_baseline_df = unipi_baseline_df.rename(columns={'AE(SAE)': 'AE'})
+        unipi_baseline_name = "SAE"
+        methods_results.append(unipi_baseline_df)
     else:
-        results = []
-        pbar = tqdm(range(numExperiments), total=numExperiments)
-        for experiment_id in pbar:
-            pbar.set_description(f'q_name={q_name}')
-            in_sample = df[f'InSample_{experiment_id}'].values.astype(dtype=bool)
+        unipi_baseline_name = None
 
-            Xtr = X[in_sample]
-            ytr = y[in_sample]
-            Atr = A[in_sample]
+    # run quantification methods
+    for q_name, quantifier in methods():
+        result_path = join(result_dir, f'{q_name}.csv')
+        if os.path.exists(result_path):
+            method_results = pd.read_csv(result_path, index_col=0)
+        else:
+            results = []
+            pbar = tqdm(range(numExperiments), total=numExperiments)
+            for experiment_id in pbar:
+                pbar.set_description(f'q_name={q_name}')
+                in_sample = df[f'InSample_{experiment_id}'].values.astype(dtype=bool)
 
-            # Xte = X[~in_sample]
-            # yte = y[~in_sample]
-            # Ate = A[~in_sample]
+                Xtr = X[in_sample]
+                ytr = y[in_sample]
+                Atr = A[in_sample]
 
-            Xte = X
-            yte = y
-            Ate = A
+                # Xte = X[~in_sample]
+                # yte = y[~in_sample]
+                # Ate = A[~in_sample]
 
-            train = LabelledCollection(Xtr, ytr, classes=[0, 1])
-            quantifier.fit(train)
+                Xte = X
+                yte = y
+                Ate = A
 
-            for area in areas:
-                sel_te_a = Ate == area
-                test_A = LabelledCollection(Xte[sel_te_a], yte[sel_te_a], classes=[0,1])
+                train = LabelledCollection(Xtr, ytr, classes=[0, 1])
+                quantifier.fit(train)
 
-                pred_prev = quantifier.quantify(test_A.X)[1]
-                true_prev = test_A.prevalence()[1]
-                ae = abs(pred_prev-true_prev)
+                for area in areas:
+                    sel_te_a = Ate == area
+                    test_A = LabelledCollection(Xte[sel_te_a], yte[sel_te_a], classes=[0,1])
 
-                results.append({
-                    'experiment_id': experiment_id,
-                    'area': area,
-                    'method': q_name,
-                    'true-prev': true_prev,
-                    'estim-prev': pred_prev,
-                    'AE': ae
-                })
+                    pred_prev = quantifier.quantify(test_A.X)[1]
+                    true_prev = test_A.prevalence()[1]
+                    ae = abs(pred_prev-true_prev)
 
-        method_results = pd.DataFrame(results)
-        method_results.to_csv(result_path, index=0)
-    methods_results.append(method_results)
+                    results.append({
+                        'experiment_id': experiment_id,
+                        'area': area,
+                        'method': q_name,
+                        'true-prev': true_prev,
+                        'estim-prev': pred_prev,
+                        'AE': ae
+                    })
 
-methods_results = pd.concat(methods_results)
-pv = methods_results.pivot_table(
-    index='area',
-    columns='method',
-    values='AE',
-    aggfunc='mean',
-    margins=True,
-    margins_name='Mean'
-)
-print(pv)
+            method_results = pd.DataFrame(results)
+            method_results.to_csv(result_path, index=0)
+        methods_results.append(method_results)
+
+    methods_results = pd.concat(methods_results)
+
+    methods_results["area"] = methods_results["area"].astype(str).str.zfill(2)
+    latex_table = LatexTable.from_dataframe(methods_results, method='method', benchmark='area', value='AE')
+    latex_table.format.configuration.resizebox=True
+
+    methods_order = [m for m, _ in methods()]
+    if unipi_baseline_name is not None:
+        methods_order = [unipi_baseline_name] + methods_order
+
+    latex_table.reorder_methods(methods_order)
+    latex_table.latexPDF(pdf_path=join('./tables', f'{config_name}.pdf'), tabular_dir=f'tabular_{config_name}', landscape=False)
+
+    pv = methods_results.pivot_table(
+        index='area',
+        columns='method',
+        values='AE',
+        aggfunc='mean',
+        margins=True,
+        margins_name='Mean'
+    )
+    print(pv)
diff --git a/result_path b/result_path
new file mode 160000
index 0000000..816a4c6
--- /dev/null
+++ b/result_path
@@ -0,0 +1 @@
+Subproject commit 816a4c675e2919ea0ec4dd2ba9bf0d518d53dc17