# Generates a synthetic multi-area binary-classification population for
# quantification experiments and writes it to ./data/ as a single CSV.
import os

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

import quapy.functional as F
from quapy.data import LabelledCollection
from quapy.protocol import UniformPrevalenceProtocol

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
random_state = 0

n_features = 10                               # covariates per individual
n_areas = 50                                  # distinct areas to simulate
n_per_area = 1_000                            # individuals within each area
population_size = n_areas * n_per_area        # total synthetic population
n_experiments = 100                           # independent survey draws
n_survey = population_size // n_experiments   # individuals per survey

# Echo the configuration in `name=value` form (same output as f'{x=}').
config = {
    'n_features': n_features,
    'n_areas': n_areas,
    'n_per_area': n_per_area,
    'population_size': population_size,
    'n_experiments': n_experiments,
    'n_survey': n_survey,
}
for param, value in config.items():
    print(f'{param}={value}')
# ---------------------------------------------------------------------------
# Synthetic population
# ---------------------------------------------------------------------------
# Generate a large pool (100x the target population) of binary-labelled
# instances; the prevalence protocol below resamples the areas from it.
synth_params = dict(
    n_samples=population_size * 100,
    n_features=n_features,
    n_informative=n_features // 2,   # half of the features carry signal
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=[0.5, 0.5],              # balanced classes in the pool
    flip_y=0.01,                     # 1% label noise
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=random_state,
)
X, y = make_classification(**synth_params)

pool = LabelledCollection(X, y, classes=[0, 1])

# One labelled sample of n_per_area instances per repetition; each
# repetition becomes one "area" with its own class prevalence.
upp = UniformPrevalenceProtocol(
    pool,
    sample_size=n_per_area,
    repeats=n_areas,
    random_state=random_state,
    return_type='labelled_collection',
)
# ---------------------------------------------------------------------------
# Area sampling
# ---------------------------------------------------------------------------
data_X = []                  # per-area feature matrices
data_y = []                  # per-area label vectors
data_area = []               # per-area id vectors (constant within an area)
experiment_selections = []   # filled later: one 0/1 mask per experiment

for area_id, area_sample in enumerate(upp()):
    print(f'{area_id=} has prevalence={F.strprev(area_sample.prevalence())}')
    data_X.append(area_sample.X)
    data_y.append(area_sample.y)
    data_area.append([area_id] * n_per_area)

data_X = np.concatenate(data_X)
data_y = np.concatenate(data_y)
data_area = np.concatenate(data_area)

# Explicit check instead of `assert`, which is stripped under `python -O`.
if len(data_area) != population_size:
    raise ValueError(
        f'unexpected size! got {len(data_area)}, expected {population_size}')
# ---------------------------------------------------------------------------
# Survey partition
# ---------------------------------------------------------------------------
# Persistent individual identifiers (row IDs in the final dataframe).
idx = np.arange(population_size)

# Shuffle the population once and slice it into n_experiments disjoint
# surveys of n_survey individuals each.  A seeded generator makes the split
# reproducible; the previous np.random.permutation drew from the unseeded
# global RNG, so the partition changed on every run despite `random_state`
# being threaded through everything else.
rng = np.random.default_rng(random_state)
rand_order = rng.permutation(population_size)

for offset in range(0, population_size, n_survey):
    selected = rand_order[offset:offset + n_survey]
    in_sample = np.zeros_like(data_area)   # 0/1 inclusion mask per individual
    in_sample[selected] = 1
    experiment_selections.append(in_sample)
# ---------------------------------------------------------------------------
# Compose and persist the dataframe
# ---------------------------------------------------------------------------
# One row per individual: ID, label, features, area id, and one inclusion
# indicator column per survey experiment.
data_dic = {
    'ID': idx,
    'Y': data_y,
}
for feat_id in range(n_features):
    data_dic[f'X_{feat_id}'] = data_X[:, feat_id]
data_dic['area'] = data_area

for experiment_id, experiment_selection in enumerate(experiment_selections):
    data_dic[f'InSample_{experiment_id}'] = experiment_selection

df = pd.DataFrame(data_dic)

# The filename encodes the generating configuration.  Use pathlib for the
# directory creation, and `index=False` instead of the original `index=0`
# (which relied on 0 being falsy) — row ids are already in the ID column.
data_path = Path(
    f'./data/data_nF{n_features}_nA{n_areas}_P{population_size}_nExp{n_experiments}.csv')
data_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(data_path, index=False)