# QuaPy/Census/simulated-cnr/gen_data.py

import os
from pathlib import Path
from sklearn.datasets import make_classification
import numpy as np
from quapy.data import LabelledCollection
from quapy.protocol import UniformPrevalenceProtocol
import quapy.functional as F
import pandas as pd

# Seed handed to the randomised generation steps of this script.
random_state = 0

# Size of the simulated census.
n_features = 10        # number of covariates generated per individual
n_areas = 50           # number of areas sampled from the pool
n_per_area = 1_000     # number of individuals in each area
population_size = n_areas * n_per_area

# Survey design: the population is cut into `n_experiments` surveys of
# `n_survey` individuals each (integer division).
n_experiments = 100
n_survey = population_size // n_experiments

# Echo the configuration, one "name=value" line per setting.
print(
    f'{n_features=}',
    f'{n_areas=}',
    f'{n_per_area=}',
    f'{population_size=}',
    f'{n_experiments=}',
    f'{n_survey=}',
    sep='\n',
)

# Settings of the synthetic binary classification problem. The pool holds
# 100 times more instances than the final population, so that the areas can
# be sampled from it.
classification_params = dict(
    n_samples=population_size * 100,
    n_features=n_features,
    n_informative=n_features // 2,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=[0.5, 0.5],
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=random_state,
)
X, y = make_classification(**classification_params)

# Wrap the pool as a labelled collection and configure the protocol that is
# later iterated to obtain the areas: `n_areas` repeats of `n_per_area`
# instances each, returned as labelled collections.
# NOTE(review): per its name the protocol presumably draws each sample with a
# uniformly distributed class prevalence -- confirm against the QuaPy docs.
pool = LabelledCollection(X, y, classes=[0, 1])
upp = UniformPrevalenceProtocol(
    pool,
    sample_size=n_per_area,
    repeats=n_areas,
    random_state=random_state,
    return_type='labelled_collection',
)

# Per-area accumulators; they are stacked into flat arrays once all the areas
# have been drawn. `experiment_selections` is filled in further below.
data_X, data_y, data_area = [], [], []
experiment_selections = []

# Each sample produced by the protocol becomes one area of the population.
for area_id, area_sample in enumerate(upp()):
    print(f'{area_id=} has prevalence={F.strprev(area_sample.prevalence())}')
    data_X.append(area_sample.X)
    data_y.append(area_sample.y)
    # every individual of this sample is tagged with the id of its area
    data_area.append(np.full(n_per_area, area_id))

# Stack the per-area pieces into population-wide arrays.
data_X, data_y, data_area = map(np.concatenate, (data_X, data_y, data_area))

assert len(data_area) == population_size, 'unexpected size!'

# Row identifiers of the population (used for the 'ID' column of the output).
idx = np.arange(population_size)

# A survey must contain at least one individual; fail with a clear message
# instead of the obscure "range() arg 3 must not be zero" of a zero step.
if n_survey <= 0:
    raise ValueError(
        f'{n_survey=} must be positive: {population_size=} is too small for {n_experiments=}'
    )

# Shuffle the population with a generator seeded by `random_state`. The global
# NumPy RNG is never seeded in this script, so drawing the permutation from it
# would make the surveys (and hence the generated file) differ on every run.
rng = np.random.default_rng(random_state)
rand_order = rng.permutation(population_size)

# Cut the shuffled order into exactly `n_experiments` disjoint surveys of
# `n_survey` individuals each. Iterating over the experiments (rather than
# over offsets up to `population_size`) avoids an extra, truncated survey when
# `population_size` is not a multiple of `n_experiments`.
for experiment_id in range(n_experiments):
    offset_id = experiment_id * n_survey
    experiment_sel = rand_order[offset_id:offset_id + n_survey]
    # 0/1 membership flag over the whole population for this survey
    in_sample_id = np.zeros_like(data_area)
    in_sample_id[experiment_sel] = 1
    experiment_selections.append(in_sample_id)

# Assemble the columns in output order: identifier, label, one column per
# feature, the area id, and finally one 0/1 membership column per survey.
data_dic = {'ID': idx, 'Y': data_y}
data_dic.update({f'X_{feat_id}': data_X[:, feat_id] for feat_id in range(n_features)})
data_dic['area'] = data_area
data_dic.update(
    (f'InSample_{experiment_id}', experiment_selection)
    for experiment_id, experiment_selection in enumerate(experiment_selections)
)

df = pd.DataFrame(data_dic)

# The file name encodes the configuration; the target directory is created on
# demand and the dataframe is written without its positional index.
data_path = f'./data/data_nF{n_features}_nA{n_areas}_P{population_size}_nExp{n_experiments}.csv'
Path(data_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(data_path, index=False)