import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

import quapy.functional as F
from quapy.data import LabelledCollection
from quapy.protocol import UniformPrevalenceProtocol

# --- experiment configuration -------------------------------------------------
random_state = 0
n_features = 10
n_areas = 50
n_per_area = 1_000
population_size = n_areas * n_per_area
n_experiments = 100
n_survey = population_size // n_experiments

print(f'{n_features=}')
print(f'{n_areas=}')
print(f'{n_per_area=}')
print(f'{population_size=}')
print(f'{n_experiments=}')
print(f'{n_survey=}')

# Draw a large synthetic pool (100x the target population) so the prevalence
# protocol below has plenty of examples to sample each area from.
X, y = make_classification(
    n_samples=population_size * 100,
    n_features=n_features,
    n_informative=n_features // 2,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=[0.5, 0.5],
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=random_state,
)

pool = LabelledCollection(X, y, classes=[0, 1])

# One labelled sample per area, each with a class prevalence drawn uniformly
# at random by the protocol.
upp = UniformPrevalenceProtocol(
    pool,
    sample_size=n_per_area,
    repeats=n_areas,
    random_state=random_state,
    return_type='labelled_collection',
)

data_X = []
data_y = []
data_area = []
experiment_selections = []
for area_id, sample in enumerate(upp()):
    print(f'{area_id=} has prevalence={F.strprev(sample.prevalence())}')
    data_X.append(sample.X)
    data_y.append(sample.y)
    data_area.append([area_id] * n_per_area)

# Stack the per-area pieces into flat population-sized arrays.
data_X = np.concatenate(data_X)
data_y = np.concatenate(data_y)
data_area = np.concatenate(data_area)

assert len(data_area) == population_size, 'unexpected size!'
# Randomly partition the population into n_experiments disjoint surveys of
# n_survey units each, recording a 0/1 in-sample indicator per experiment.
idx = np.arange(population_size)
# BUGFIX: the original called the unseeded global np.random.permutation, so the
# survey assignment changed on every run even though `random_state` controls
# every other random step in this script; use a seeded generator instead.
rng = np.random.default_rng(random_state)
rand_order = rng.permutation(population_size)
for offset_id in range(0, population_size, n_survey):
    experiment_sel = rand_order[offset_id:offset_id + n_survey]
    # indicator vector aligned with the population rows (same length/int dtype
    # as the area column)
    in_sample_id = np.zeros_like(data_area)
    in_sample_id[experiment_sel] = 1
    experiment_selections.append(in_sample_id)

# Compose the dataframe: ID, label, the feature columns, the area id, and one
# in-sample indicator column per experiment.
data_dic = {
    'ID': idx,
    'Y': data_y,
}
for feat_id in range(n_features):
    data_dic[f'X_{feat_id}'] = data_X[:, feat_id]
data_dic['area'] = data_area
for experiment_id, experiment_selection in enumerate(experiment_selections):
    data_dic[f'InSample_{experiment_id}'] = experiment_selection

df = pd.DataFrame(data_dic)

data_path = f'./data/data_nF{n_features}_nA{n_areas}_P{population_size}_nExp{n_experiments}.csv'
os.makedirs(Path(data_path).parent, exist_ok=True)
# index=False (was index=0): same effect, idiomatic spelling
df.to_csv(data_path, index=False)