# QuaPy/Census/main.py
# File-viewer metadata: 125 lines, 3.1 KiB, Python; 2024-03-15 18:02:05 +01:00
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
from tqdm import tqdm
import quapy as qp
from quapy.method.aggregative import EMQ, PACC, CC, PCC, MS2, MS
from quapy.data import LabelledCollection
from sklearn.preprocessing import StandardScaler
# Print numpy arrays without wrapping rows at a fixed line width.
np.set_printoptions(linewidth=np.inf)
# Locations of the input CSV files (relative paths).
# cens_y is currently only referenced from commented-out code in this file;
# survey_y is the file the script loads.
cens_y = './data/cens_y.csv'
survey_y = './data/survey_y.csv'
def load_csv(file, use_yhat=True):
    """Read a CSV file and split it into area codes, covariates and labels.

    The area code is taken from column ``cod.prov``. The covariate matrix is
    built from the columns ``owner, eta, work, sex, year_edu, hsize``,
    followed by ``y.hat`` (only when ``use_yhat`` is true) and ``prob``.
    The distinct values found in each covariate column are printed.

    :param file: anything accepted by ``pandas.read_csv`` (path or buffer)
    :param use_yhat: whether to include the ``y.hat`` column as a covariate
    :return: ``(A, X, y)`` if the file has a ``y.true`` column, otherwise
        ``(A, X)``; ``A`` holds the area codes, ``X`` the covariates and
        ``y`` the values of ``y.true``
    """
    area_col = 'cod.prov'
    label_col = 'y.true'

    # Build the feature list once; 'y.hat' sits between 'hsize' and 'prob'.
    feature_cols = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize']
    if use_yhat:
        feature_cols.append('y.hat')
    feature_cols.append('prob')

    frame = pd.read_csv(file)
    X = frame[feature_cols].values
    A = frame[area_col].values

    # Report the distinct values observed in every covariate column.
    for i, cov in enumerate(feature_cols):
        print(f'values of col {i} "{cov}" {np.unique(X[:,i])}')

    # Files without the label column yield only areas and covariates.
    if label_col not in frame.columns:
        return A, X
    return A, X, frame[label_col].values
def get_dataset_by_area(A, X, y=None):
    """Partition the rows of ``X`` (and ``y``, if given) by area code.

    :param A: array with one area code per row of ``X``
    :param X: covariate matrix, indexed by row with a boolean mask
    :param y: optional array of labels aligned with ``X``
    :return: list of ``(area, X_area, y_area)`` tuples, one per distinct
        value in ``A`` in the order given by ``np.unique``; ``y_area`` is
        ``None`` when ``y`` is not provided
    """
    def _subset(area_code):
        # Boolean mask picking the rows that belong to this area.
        members = (A == area_code)
        labels = None if y is None else y[members]
        return area_code, X[members], labels

    return [_subset(area_code) for area_code in np.unique(A)]
class Preprocessor:
    """Standardize a chosen subset of feature columns with a ``StandardScaler``.

    The scaler is fitted on the selected columns only; all remaining columns
    are passed through unchanged by :meth:`transform`.
    """

    def __init__(self, standardize_col_ids=None):
        """
        :param standardize_col_ids: indices of the columns to standardize.
            Defaults to ``np.arange(8)``, i.e. the first eight columns
            (every column of the 8-feature matrix used by this script).
        """
        self.scaler = StandardScaler()
        # self.standardize_col_ids = np.asarray([1, 4, 5])  # eta, year_edu, hsize
        if standardize_col_ids is None:
            standardize_col_ids = np.arange(8)  # everything
        # Indices of the columns that fit/transform operate on.
        self.standardize_col_ids = np.asarray(standardize_col_ids)

    def fit(self, X, y=None):
        """Fit the scaler on the selected columns of ``X``.

        :param X: 2-D feature matrix
        :param y: ignored; accepted for API symmetry with ``fit_transform``
        :return: ``self``
        """
        self.scaler.fit(X[:, self.standardize_col_ids])
        return self

    def transform(self, X):
        """Return a standardized copy of ``X``; ``X`` itself is left untouched.

        Writing the z-scores back into the caller's array would (a) silently
        alter data the caller may still share with other code and (b)
        truncate the scores when ``X`` has an integer dtype, so the result
        is always a fresh floating-point array.

        :param X: 2-D feature matrix
        :return: new array with the selected columns replaced by z-scores
        """
        X_out = np.array(X, copy=True)
        if not np.issubdtype(X_out.dtype, np.floating):
            # Integer (or bool) storage cannot hold z-scores.
            X_out = X_out.astype(float)
        cols = self.standardize_col_ids
        X_out[:, cols] = self.scaler.transform(X_out[:, cols])
        return X_out

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its standardized copy.

        :param X: 2-D feature matrix
        :param y: ignored
        :return: new array, as returned by :meth:`transform`
        """
        return self.fit(X, y).transform(X)
# Experiment: for every area, train a quantifier on that area's survey data
# and evaluate it (mean absolute error between true and predicted class
# prevalence) on each of the other areas.

# Classifier / quantifier under evaluation; alternatives are kept commented.
# cls = LinearSVC()
cls = LogisticRegressionCV(class_weight='balanced', Cs=10)
q = CC(cls)
# q = PCC(cls)
# q = PACC(cls)
# q = EMQ(cls)
# q = MS(cls)

# Ate, Xte = load_csv(cens_y)
Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()

# Xtr_proc = preprocessor.fit_transform(Xtr)
# big_train = LabelledCollection(Xtr_proc, ytr)
# q.fit(big_train)

trains = get_dataset_by_area(Atr, Xtr, ytr)
# tests = get_dataset_by_area(Ate, Xte)

n_area = len(trains)

# results[i, j] = error of the quantifier trained on area i, tested on area j.
# The diagonal (i == j) is never evaluated and stays at 0.
results = np.zeros(shape=(n_area, n_area))

for i, (Ai, Xi, yi) in tqdm(enumerate(trains), total=n_area):
    # Hand the preprocessor a float copy: the arrays stored in `trains` are
    # reused on every outer iteration and must keep their raw values.
    Xi = preprocessor.fit_transform(Xi.astype(float))
    tr = LabelledCollection(Xi, yi)
    q.fit(tr)
    len_tr = len(tr)
    # len_tr = len(big_train)
    for j, (Aj, Xj, yj) in enumerate(trains):
        if i == j:
            continue
        # Same reasoning as above: never standardize the shared array itself.
        Xj = preprocessor.transform(Xj.astype(float))
        te = LabelledCollection(Xj, yj)
        pred_prev = q.quantify(te.X)
        true_prev = te.prevalence()
        err = qp.error.mae(true_prev, pred_prev)
        print(f'{i=} {j=} [#train={len_tr}] true_prev={true_prev[1]:.3f} pred_prev={pred_prev[1]:.3f} {err=:.4f}')
        results[i, j] = err

print(results)

# Average over evaluated (off-diagonal) cells only; including the unfilled
# diagonal zeros would bias the mean error downwards by a factor (n-1)/n.
off_diagonal = ~np.eye(n_area, dtype=bool)
mean_err = results[off_diagonal].mean() if n_area > 1 else float('nan')
print(f'mean results = {mean_err:.4f}')