QuaPy/Census/simulated-cnr/gen_data.py

92 lines
2.4 KiB
Python

import os
from pathlib import Path
from sklearn.datasets import make_classification
import numpy as np
from quapy.data import LabelledCollection
from quapy.protocol import UniformPrevalenceProtocol
import quapy.functional as F
import pandas as pd
# --- Experiment configuration -------------------------------------------
# Seed shared by the synthetic-data generator and the sampling protocol.
random_state = 0

# Shape of the synthetic population: `n_areas` areas with `n_per_area`
# instances each, every instance described by `n_features` covariates.
n_features = 10
n_areas = 50
n_per_area = 1_000
population_size = n_per_area * n_areas

# The population is sliced into `n_experiments` surveys of `n_survey`
# instances each (integer division, so any remainder is not counted).
n_experiments = 100
n_survey = population_size // n_experiments

# Echo the configuration, one `name=value` pair per line.
print('\n'.join(
    f'{label}={value!r}'
    for label, value in (
        ('n_features', n_features),
        ('n_areas', n_areas),
        ('n_per_area', n_per_area),
        ('population_size', population_size),
        ('n_experiments', n_experiments),
        ('n_survey', n_survey),
    )
))
# Synthetic binary-labelled pool, 100 times larger than the target
# population; the per-area samples are drawn from it by the protocol below.
X, y = make_classification(
    # pool size and reproducibility
    n_samples=100 * population_size,
    shuffle=True,
    random_state=random_state,
    # feature layout: floor(n_features / 2) informative, 2 redundant, 0 repeated
    n_features=n_features,
    n_informative=n_features // 2,
    n_redundant=2,
    n_repeated=0,
    # class structure: two equally weighted classes, two clusters per class
    n_classes=2,
    n_clusters_per_class=2,
    weights=[0.5, 0.5],
    flip_y=0.01,
    # geometry
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
)
pool = LabelledCollection(X, y, classes=[0, 1])

# Sampling protocol over the pool: `n_areas` repeats of `n_per_area`
# instances each, seeded with `random_state`; every generated sample is
# consumed downstream as one "area".
upp = UniformPrevalenceProtocol(
    pool,
    sample_size=n_per_area,
    repeats=n_areas,
    random_state=random_state,
    return_type='labelled_collection',
)
# Per-area pieces; they are concatenated into flat population arrays once
# every area has been generated.
data_X, data_y, data_area = [], [], []
# One membership vector per experiment is appended to this list further down.
experiment_selections = []

for area_id, area_sample in enumerate(upp()):
    print(f'{area_id=} has prevalence={F.strprev(area_sample.prevalence())}')
    data_X.append(area_sample.X)
    data_y.append(area_sample.y)
    # Every instance of this sample is tagged with the id of its area.
    data_area.append(np.full(n_per_area, area_id))

# Flatten the per-area pieces into single arrays covering the population.
data_X, data_y, data_area = (
    np.concatenate(pieces) for pieces in (data_X, data_y, data_area)
)
assert data_area.shape[0] == population_size, 'unexpected size!'
# Row identifiers of the population: 0 .. population_size - 1.
idx = np.arange(population_size)

# A zero-sized survey would make every experiment empty; fail loudly instead
# (the stepped range used previously also raised ValueError in this case).
if n_survey < 1:
    raise ValueError(
        f'{n_survey=} must be at least 1 '
        f'({n_experiments=} exceeds {population_size=})'
    )

# Shuffle the population with a generator seeded from `random_state`, like
# every other random step in this script, so that the survey assignments are
# reproducible from one run to the next.
rand_order = np.random.default_rng(random_state).permutation(population_size)

# Build exactly `n_experiments` disjoint surveys of `n_survey` instances each.
# Iterating over the experiment count (instead of stepping through the whole
# population) avoids an extra, shorter survey when `population_size` is not a
# multiple of `n_experiments`; the leftover instances belong to no survey.
for experiment_id in range(n_experiments):
    offset_id = experiment_id * n_survey
    experiment_sel = rand_order[offset_id:offset_id + n_survey]
    # 0/1 flag per instance: 1 if the instance is in this experiment's survey.
    in_sample_id = np.zeros_like(data_area)
    in_sample_id[experiment_sel] = 1
    experiment_selections.append(in_sample_id)
# Compose the dataframe. Dict insertion order fixes the column order:
# ID, Y, X_0 .. X_{n_features-1}, area, then one InSample_<k> flag column
# per experiment.
data_dic = {
    'ID': idx,
    'Y': data_y,
    **{
        f'X_{feat_id}': data_X[:, feat_id]
        for feat_id in range(n_features)
    },
    'area': data_area,
    **{
        f'InSample_{experiment_id}': experiment_selection
        for experiment_id, experiment_selection in enumerate(experiment_selections)
    },
}
df = pd.DataFrame(data_dic)
# The output file name encodes the configuration used to generate the data.
data_path = f'./data/data_nF{n_features}_nA{n_areas}_P{population_size}_nExp{n_experiments}.csv'
# Make sure the target directory exists, then write the table without the
# pandas row index (the 'ID' column already identifies each row).
Path(data_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(data_path, index=False)