experiments for report
This commit is contained in:
parent f063e4f5dc
commit ccae7746ce
@@ -0,0 +1,3 @@
+[submodule "result_path"]
+	path = result_path
+	url = gitea@gitea-s2i2s.isti.cnr.it:moreo/result_table.git
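Note (not part of the diff): since result_path points at a separate repository, a fresh clone will not contain its files until the submodule is fetched, e.g. with git submodule update --init result_path.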
@@ -0,0 +1,22 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class DataConfig:
+    n_features: int
+    n_informative: int
+    n_redundant: int
+    n_clusters_per_class: int
+    flip_y: float
+
+config_easy = DataConfig(n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, flip_y=0.0)
+config_medium = DataConfig(n_features=10, n_informative=5, n_redundant=2, n_clusters_per_class=2, flip_y=0.01)
+config_hard = DataConfig(n_features=50, n_informative=20, n_redundant=10, n_clusters_per_class=4, flip_y=0.05)
+config_veryhard = DataConfig(n_features=100, n_informative=30, n_redundant=20, n_clusters_per_class=8, flip_y=0.05)
+
+configs = [
+    config_easy,
+    config_medium,
+    config_hard,
+    config_veryhard
+]
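Aside (mine, not in the commit): the DataConfig fields deliberately mirror parameters of sklearn's make_classification, which is what the generation script below feeds them into. A minimal self-contained sketch of how one preset maps onto the generator:

from sklearn.datasets import make_classification
from commons import config_easy

# draw a small dataset using the 'easy' difficulty preset defined above
X, y = make_classification(
    n_samples=1_000,
    n_features=config_easy.n_features,
    n_informative=config_easy.n_informative,
    n_redundant=config_easy.n_redundant,
    n_clusters_per_class=config_easy.n_clusters_per_class,
    flip_y=config_easy.flip_y,
    random_state=0)
print(X.shape, y.mean())  # (1000, 2), roughly balanced labels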
@@ -6,85 +6,88 @@ from quapy.data import LabelledCollection
 from quapy.protocol import UniformPrevalenceProtocol
 import quapy.functional as F
 import pandas as pd
+from commons import configs
 
 random_state = 0
 
-n_features = 10
-
 n_areas = 50
 n_per_area = 1_000
 population_size = n_areas * n_per_area
 n_experiments = 100
 n_survey = population_size//n_experiments
 
-print(f'{n_features=}')
-print(f'{n_areas=}')
-print(f'{n_per_area=}')
-print(f'{population_size=}')
-print(f'{n_experiments=}')
-print(f'{n_survey=}')
-
-X, y = make_classification(
-    n_samples=population_size * 100,
-    n_features=n_features,
-    n_informative=n_features//2,
-    n_redundant=2,
-    n_repeated=0,
-    n_classes=2,
-    n_clusters_per_class=2,
-    weights=[0.5, 0.5],
-    flip_y=0.01,
-    class_sep=1.0,
-    hypercube=True,
-    shift=0.0,
-    scale=1.0,
-    shuffle=True,
-    random_state=random_state)
-
-pool = LabelledCollection(X, y, classes=[0,1])
-upp = UniformPrevalenceProtocol(pool, sample_size=n_per_area, repeats=n_areas, random_state=random_state, return_type='labelled_collection')
-
-data_X = []
-data_y = []
-data_area = []
-experiment_selections = []
-
-for area_id, area_sample in enumerate(upp()):
-    print(f'{area_id=} has prevalence={F.strprev(area_sample.prevalence())}')
-    data_X.append(area_sample.X)
-    data_y.append(area_sample.y)
-    data_area.append([area_id]*n_per_area)
-
-data_X = np.concatenate(data_X)
-data_y = np.concatenate(data_y)
-data_area = np.concatenate(data_area)
-
-assert len(data_area) == population_size, 'unexpected size!'
-
-idx = np.arange(population_size)
-rand_order = np.random.permutation(population_size)
-for experiment_id, offset_id in enumerate(range(0,population_size,n_survey)):
-    experiment_sel = rand_order[offset_id:offset_id+n_survey]
-    in_sample_id = np.zeros_like(data_area)
-    in_sample_id[experiment_sel] = 1
-    experiment_selections.append(in_sample_id)
-
-# compose the dataframe
-data_dic = {
-    'ID': idx,
-    'Y': data_y,
-}
-for feat_id in range(n_features):
-    data_dic[f'X_{feat_id}'] = data_X[:,feat_id]
-data_dic['area'] = data_area
-
-for experiment_id, experiment_selection in enumerate(experiment_selections):
-    data_dic[f'InSample_{experiment_id}'] = experiment_selection
-
-df = pd.DataFrame(data_dic)
-
-data_path = f'./data/data_nF{n_features}_nA{n_areas}_P{population_size}_nExp{n_experiments}.csv'
-os.makedirs(Path(data_path).parent, exist_ok=True)
-df.to_csv(data_path, index=0)
+for config in configs:
+    print(f'{config.n_features=}')
+    print(f'{n_areas=}')
+    print(f'{n_per_area=}')
+    print(f'{population_size=}')
+    print(f'{n_experiments=}')
+    print(f'{n_survey=}')
+
+    X, y = make_classification(
+        n_samples=population_size * 100,
+        n_features=config.n_features,
+        n_informative=config.n_informative,
+        n_redundant=config.n_redundant,
+        n_repeated=0,
+        n_classes=2,
+        n_clusters_per_class=config.n_clusters_per_class,
+        weights=[0.5, 0.5],
+        flip_y=config.flip_y,
+        class_sep=1.0,
+        hypercube=True,
+        shift=0.0,
+        scale=1.0,
+        shuffle=True,
+        random_state=random_state)
+
+    pool = LabelledCollection(X, y, classes=[0,1])
+    upp = UniformPrevalenceProtocol(pool, sample_size=n_per_area, repeats=n_areas, random_state=random_state, return_type='labelled_collection')
+
+    data_X = []
+    data_y = []
+    data_area = []
+    experiment_selections = []
+
+    for area_id, area_sample in enumerate(upp()):
+        print(f'{area_id=} has prevalence={F.strprev(area_sample.prevalence())}')
+        data_X.append(area_sample.X)
+        data_y.append(area_sample.y)
+        data_area.append([area_id]*n_per_area)
+
+    data_X = np.concatenate(data_X)
+    data_y = np.concatenate(data_y)
+    data_area = np.concatenate(data_area)
+
+    assert len(data_area) == population_size, 'unexpected size!'
+
+    idx = np.arange(population_size)
+    rand_order = np.random.permutation(population_size)
+    for experiment_id, offset_id in enumerate(range(0,population_size,n_survey)):
+        experiment_sel = rand_order[offset_id:offset_id+n_survey]
+        in_sample_id = np.zeros_like(data_area)
+        in_sample_id[experiment_sel] = 1
+        experiment_selections.append(in_sample_id)
+
+    # compose the dataframe
+    data_dic = {
+        'ID': idx,
+        'Y': data_y,
+    }
+    for feat_id in range(config.n_features):
+        data_dic[f'X_{feat_id}'] = data_X[:,feat_id]
+    data_dic['area'] = data_area
+
+    for experiment_id, experiment_selection in enumerate(experiment_selections):
+        data_dic[f'InSample_{experiment_id}'] = experiment_selection
+
+    df = pd.DataFrame(data_dic)
+
+    data_path = f'./data/data_nF{config.n_features}_nA{n_areas}_P{population_size}_nExp{n_experiments}.csv'
+    os.makedirs(Path(data_path).parent, exist_ok=True)
+    df.to_csv(data_path, index=0)
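Worth noting about the selection loop above: the InSample_{experiment_id} masks are consecutive slices of a single random permutation, so, if I read the code right, the 100 surveys of size n_survey=500 are pairwise disjoint and jointly cover the population exactly once. A self-contained check of that invariant:

import numpy as np

population_size, n_survey = 50_000, 500
rand_order = np.random.permutation(population_size)
masks = []
for offset in range(0, population_size, n_survey):
    in_sample = np.zeros(population_size, dtype=bool)
    in_sample[rand_order[offset:offset + n_survey]] = True
    masks.append(in_sample)

# every record is selected by exactly one experiment
counts = np.sum(masks, axis=0)
assert counts.min() == 1 and counts.max() == 1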
@@ -3,15 +3,14 @@ from os.path import join
 import numpy as np
 import pandas as pd
 from sklearn.calibration import CalibratedClassifierCV
 from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from pathlib import Path
 from quapy.data import LabelledCollection
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import APP
 from quapy.method.aggregative import PACC, PCC, EMQ, DMy, ACC, KDEyML, CC
 import quapy.functional as F
 from tqdm import tqdm
+from commons import configs
 
 from src.new_table import LatexTable
 
 pd.set_option('display.max_columns', None)
 pd.set_option('display.width', 1000)
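For orientation (my sketch, not part of the commit): the quapy workflow used below is to fit a quantifier on a LabelledCollection and then call quantify on raw instances, which returns a prevalence vector over the classes; indexing with [1] picks the positive-class prevalence.

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.aggregative import PACC

X = np.random.randn(1_000, 10)
y = (X[:, 0] > 0).astype(int)            # synthetic binary labels
train = LabelledCollection(X[:500], y[:500], classes=[0, 1])

quantifier = PACC(classifier=LogisticRegression())
quantifier.fit(train)
prev = quantifier.quantify(X[500:])      # prevalence vector summing to 1
print(prev[1])                           # estimated positive-class prevalence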
@@ -42,80 +41,106 @@ def methods():
     yield 'PACC', PACC(classifier=LogisticRegression())
     yield 'EMQ', EMQ(classifier=LogisticRegression())
     yield 'KDEy', KDEyML(classifier=LogisticRegression(), bandwidth=0.05)
-    yield 'KDEy01', KDEyML(classifier=LogisticRegression())
+    # yield 'KDEy01', KDEyML(classifier=LogisticRegression())
 
 
-data_path = './data/data_nF10_nA50_P50000_nExp100.csv'
-
-config = Path(data_path).name.replace('.csv','')
-result_dir = f'./results/{config}'
-os.makedirs(result_dir, exist_ok=True)
-print(f'Running {config}')
-
-X, y, A, numExperiments, df = load_data(data_path)
-
-areas = sorted(np.unique(A))
-n_areas = len(areas)
-
-methods_results = []
-
-for q_name, quantifier in methods():
-
-    result_path = join(result_dir, f'{q_name}.csv')
-    if os.path.exists(result_path):
-        method_results = pd.read_csv(result_path, index_col=0)
-    else:
-        results = []
-        pbar = tqdm(range(numExperiments), total=numExperiments)
-        for experiment_id in pbar:
-            pbar.set_description(f'q_name={q_name}')
-            in_sample = df[f'InSample_{experiment_id}'].values.astype(dtype=bool)
-
-            Xtr = X[in_sample]
-            ytr = y[in_sample]
-            Atr = A[in_sample]
-
-            # Xte = X[~in_sample]
-            # yte = y[~in_sample]
-            # Ate = A[~in_sample]
-            Xte = X
-            yte = y
-            Ate = A
-
-            train = LabelledCollection(Xtr, ytr, classes=[0, 1])
-            quantifier.fit(train)
-
-            for area in areas:
-                sel_te_a = Ate == area
-                test_A = LabelledCollection(Xte[sel_te_a], yte[sel_te_a], classes=[0,1])
-
-                pred_prev = quantifier.quantify(test_A.X)[1]
-                true_prev = test_A.prevalence()[1]
-                ae = abs(pred_prev-true_prev)
-
-                results.append({
-                    'experiment_id': experiment_id,
-                    'area': area,
-                    'method': q_name,
-                    'true-prev': true_prev,
-                    'estim-prev': pred_prev,
-                    'AE': ae
-                })
-
-        method_results = pd.DataFrame(results)
-        method_results.to_csv(result_path, index=0)
-    methods_results.append(method_results)
-
-methods_results = pd.concat(methods_results)
-pv = methods_results.pivot_table(
-    index='area',
-    columns='method',
-    values='AE',
-    aggfunc='mean',
-    margins=True,
-    margins_name='Mean'
-)
-print(pv)
+for config in configs:
+
+    config_name = f'data_nF{config.n_features}_nA50_P50000_nExp100'
+    data_path = f'./data/{config_name}.csv'
+
+    result_dir = f'./results/{config_name}'
+    os.makedirs(result_dir, exist_ok=True)
+
+    X, y, A, numExperiments, df = load_data(data_path)
+
+    areas = sorted(np.unique(A))
+    n_areas = len(areas)
+
+    methods_results = []
+
+    # load baseline result from UniPI
+    baseline_path = join(result_dir, 'Risultati_SAE.csv')
+    if os.path.exists(baseline_path):
+        unipi_baseline_df = pd.read_csv(baseline_path, index_col=0, sep=';')
+        unipi_baseline_df = unipi_baseline_df.rename(columns={'AE(SAE)': 'AE'})
+        unipi_baseline_name = "SAE"
+        methods_results.append(unipi_baseline_df)
+    else:
+        unipi_baseline_name = None
+
+    # run quantification methods
+    for q_name, quantifier in methods():
+        result_path = join(result_dir, f'{q_name}.csv')
+        if os.path.exists(result_path):
+            method_results = pd.read_csv(result_path, index_col=0)
+        else:
+            results = []
+            pbar = tqdm(range(numExperiments), total=numExperiments)
+            for experiment_id in pbar:
+                pbar.set_description(f'q_name={q_name}')
+                in_sample = df[f'InSample_{experiment_id}'].values.astype(dtype=bool)
+
+                Xtr = X[in_sample]
+                ytr = y[in_sample]
+                Atr = A[in_sample]
+
+                # Xte = X[~in_sample]
+                # yte = y[~in_sample]
+                # Ate = A[~in_sample]
+                Xte = X
+                yte = y
+                Ate = A
+
+                train = LabelledCollection(Xtr, ytr, classes=[0, 1])
+                quantifier.fit(train)
+
+                for area in areas:
+                    sel_te_a = Ate == area
+                    test_A = LabelledCollection(Xte[sel_te_a], yte[sel_te_a], classes=[0,1])
+
+                    pred_prev = quantifier.quantify(test_A.X)[1]
+                    true_prev = test_A.prevalence()[1]
+                    ae = abs(pred_prev-true_prev)
+
+                    results.append({
+                        'experiment_id': experiment_id,
+                        'area': area,
+                        'method': q_name,
+                        'true-prev': true_prev,
+                        'estim-prev': pred_prev,
+                        'AE': ae
+                    })
+
+            method_results = pd.DataFrame(results)
+            method_results.to_csv(result_path, index=0)
+        methods_results.append(method_results)
+
+    methods_results = pd.concat(methods_results)
+
+    methods_results["area"] = methods_results["area"].astype(str).str.zfill(2)
+    latex_table = LatexTable.from_dataframe(methods_results, method='method', benchmark='area', value='AE')
+    latex_table.format.configuration.resizebox=True
+
+    methods_order = [m for m, _ in methods()]
+    if unipi_baseline_name is not None:
+        methods_order = [unipi_baseline_name] + methods_order
+
+    latex_table.reorder_methods(methods_order)
+    latex_table.latexPDF(pdf_path=join('./tables', f'{config_name}.pdf'), tabular_dir=f'tabular_{config_name}', landscape=False)
+
+    pv = methods_results.pivot_table(
+        index='area',
+        columns='method',
+        values='AE',
+        aggfunc='mean',
+        margins=True,
+        margins_name='Mean'
+    )
+    print(pv)
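Since the report will read these tables: AE is the absolute difference between estimated and true positive-class prevalence in an area, and the pivot averages it per area across experiments, with a 'Mean' margin over all areas. A toy illustration of the same pivot with made-up numbers:

import pandas as pd

toy = pd.DataFrame([
    {'area': 0, 'method': 'PACC', 'true-prev': 0.50, 'estim-prev': 0.42},
    {'area': 0, 'method': 'EMQ',  'true-prev': 0.50, 'estim-prev': 0.55},
    {'area': 1, 'method': 'PACC', 'true-prev': 0.30, 'estim-prev': 0.33},
    {'area': 1, 'method': 'EMQ',  'true-prev': 0.30, 'estim-prev': 0.24},
])
toy['AE'] = (toy['estim-prev'] - toy['true-prev']).abs()   # e.g. |0.42-0.50| = 0.08
print(toy.pivot_table(index='area', columns='method', values='AE',
                      aggfunc='mean', margins=True, margins_name='Mean'))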
@@ -0,0 +1 @@
+Subproject commit 816a4c675e2919ea0ec4dd2ba9bf0d518d53dc17