172 lines
5.8 KiB
Python
172 lines
5.8 KiB
Python
import numpy as np
|
|
import pandas as pd
|
|
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
|
|
|
|
from quapy.data import LabelledCollection
|
|
from quapy.method.base import BaseQuantifier
|
|
import quapy.functional as F
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
# Module-level side effect: with an infinite line width numpy never wraps printed arrays,
# so each array row printed by the helpers below stays on a single line.
np.set_printoptions(linewidth=np.inf)
|
|
|
|
|
|
def load_csv(file, use_yhat=True):
    """
    Reads a csv file and splits it into area codes, covariates and (when available) labels.

    As a side effect, the distinct values found in every covariate column are printed.

    :param file: anything accepted by `pandas.read_csv` (path or file-like object)
    :param use_yhat: whether the column "y.hat" is included among the covariates (it is placed
        right before the last column, "prob")
    :return: a tuple `(A, X, y)` if the file contains the column "y.true", or a tuple `(A, X)`
        otherwise; `A` holds the values of the column "cod.prov", `X` is the covariate matrix and
        `y` holds the values of the column "y.true"
    """
    frame = pd.read_csv(file)

    area_column = 'cod.prov'
    label_column = 'y.true'

    # "prob" is always the last covariate; "y.hat" is optional and goes just before it
    covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize']
    if use_yhat:
        covariates.append('y.hat')
    covariates.append('prob')

    X = frame[covariates].values
    A = frame[area_column].values

    for position, name in enumerate(covariates):
        distinct = np.unique(X[:, position])
        print(f'values of col {position} "{name}" {distinct}')

    # unlabelled files carry no "y.true" column
    if label_column not in frame.columns:
        return A, X

    return A, X, frame[label_column].values
|
|
|
|
|
|
def get_dataset_by_area(A, X, y=None):
    """
    Splits the instances (and, optionally, their labels) into one group per area.

    :param A: array with the area code of every instance
    :param X: covariate matrix, with one row per entry of `A`
    :param y: optional array of labels, with one entry per entry of `A`
    :return: a list of tuples `(area, X_area, y_area)`, one for each distinct value in `A` and
        sorted as returned by `np.unique`; `y_area` is None whenever `y` is None
    """
    def _slice_area(area):
        # boolean mask selecting the rows that belong to this area
        in_area = (A == area)
        labels = None if y is None else y[in_area]
        return area, X[in_area], labels

    return [_slice_area(area) for area in np.unique(A)]
|
|
|
|
|
|
class AdjMatrix:
    """
    Adjacency matrix between areas, loaded from a csv file.

    The first column of the csv is discarded; the headers of the remaining columns are parsed as
    integer area codes, and the remaining cells are stored as an integer matrix `M`. Two areas are
    considered adjacent when the corresponding entry of `M` is exactly 1.
    NOTE(review): rows are presumably listed in the same order as the columns -- to be confirmed
    against the csv files.

    :param path: anything accepted by `pandas.read_csv`
    :param add_diagonal: if True, the identity matrix is added to `M`
    """

    def __init__(self, path, add_diagonal=False):
        table = pd.read_csv(path)

        header = table.columns[1:].values
        codes = np.asarray([int(label) for label in header])

        cells = table.values[:, 1:]
        print(codes)
        print(cells)

        # bidirectional mapping between area codes and row/column positions in M
        self.area2idx = dict(zip(codes, range(len(codes))))
        self.idx2area = codes
        self.M = np.asarray(cells, dtype=int)

        if add_diagonal:
            # adding the diagonal has the effect of considering an area to be adjacent to itself.
            # This is useful when the model is trained using survey_y.csv data and tested using
            # cens_y.csv, but should not be done when the model is trained and tested on
            # survey_y.csv
            self.M += np.eye(self.M.shape[0], dtype=int)

    def adjacent(self, cod_1, cod_2):
        """
        Tells whether two areas are adjacent.

        :param cod_1: code of the first area
        :param cod_2: code of the second area
        :return: True if the entry of `M` for the pair (cod_1, cod_2) equals 1, False otherwise
        """
        row = self.area2idx[cod_1]
        col = self.area2idx[cod_2]
        return (self.M[row, col] == 1)

    def get_adjacent(self, cod):
        """
        Returns the codes of all the areas adjacent to a given one.

        :param cod: code of the area
        :return: array with the codes of the areas whose entry in the row of `cod` equals 1
        """
        row = self.M[self.area2idx[cod]]
        neighbours = np.flatnonzero(row == 1)
        return self.idx2area[neighbours]
|
|
|
|
|
|
class Preprocessor:
    """
    Standardizes (z-scores) a selection of the columns of a covariate matrix.

    The statistics are estimated by `fit` and applied by `transform`, which returns a new array
    and never modifies its input.

    :param standardize_col_ids: indexes of the columns to standardize. The default (None) selects
        the first 8 columns, i.e., all the covariates returned by `load_csv` with `use_yhat=True`.
        Pass, e.g., `[1, 4, 5]` to standardize only "eta", "year_edu", and "hsize", or
        `np.arange(7)` for matrices loaded with `use_yhat=False`.
    """

    def __init__(self, standardize_col_ids=None):
        self.scaler = StandardScaler()
        if standardize_col_ids is None:
            standardize_col_ids = np.arange(8)  # everything
        self.standardize_col_ids = np.asarray(standardize_col_ids)

    def fit(self, X, y=None):
        """
        Estimates the mean and scale of the selected columns.

        :param X: covariate matrix (2-dimensional array)
        :param y: ignored; accepted only for API compatibility
        :return: self
        """
        Xsel = X[:, self.standardize_col_ids]
        self.scaler.fit(Xsel)
        return self

    def transform(self, X):
        """
        Returns a standardized copy of `X`; the array passed as argument is left untouched.

        :param X: covariate matrix (2-dimensional array)
        :return: a new array in which the selected columns have been z-scored
        """
        # work on a copy: writing into the caller's array would standardize it twice if
        # `transform` is invoked again on the same data
        X_out = np.array(X)
        if X_out.dtype.kind in 'biu':
            # integer (or boolean) storage would silently truncate the z-scores on assignment
            X_out = X_out.astype(float)
        Xsel = X_out[:, self.standardize_col_ids]
        X_out[:, self.standardize_col_ids] = self.scaler.transform(Xsel)
        return X_out

    def fit_transform(self, X, y=None):
        """
        Fits the scaler on `X` and returns the standardized copy of `X`.

        :param X: covariate matrix (2-dimensional array)
        :param y: ignored; accepted only for API compatibility
        :return: a new array in which the selected columns have been z-scored
        """
        return self.fit(X, y).transform(X)

    def get_mean_std(self, column):
        """
        Returns the statistics learned for one of the standardized columns.

        :param column: position of the column within the standardized selection (this coincides
            with the column index in `X` only when the selection is a leading range such as the
            default one)
        :return: a tuple `(mean, std)` taken from the fitted scaler's `mean_` and `scale_`
        """
        mean = self.scaler.mean_[column]
        std = self.scaler.scale_[column]
        return mean, std
|
|
|
|
|
|
class StatModel(BaseQuantifier):
    """
    Wrapper that reports, as the predicted prevalence, the expected value of the column "prob".
    The column "prob" is pre-computed by a different model used by our statisticians, so nothing is
    learned here: the method simply averages that column.

    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
        it is the last column both in survey_y.csv and in cens_y.csv
    :param mean: the mean of the column. If specified, then the column is assumed to be
        standardized, and the inverse function is applied in order to recover the posterior
        probability in the range [0,1]
    :param scale: the scale of the column. If specified, then the column is assumed to be
        standardized, and the inverse function is applied in order to recover the posterior
        probability in the range [0,1]
    """

    def __init__(self, posteriors_column=-1, mean=0, scale=1):
        self.posteriors_column = posteriors_column
        self.mean = mean
        self.scale = scale

    def fit(self, data: LabelledCollection):
        """
        Does nothing, since the posteriors are pre-computed.

        :param data: ignored
        :return: self
        """
        return self

    def quantify(self, instances):
        """
        Averages the (de-standardized) column "prob" of the instances.

        :param instances: covariate matrix containing the column "prob"
        :return: the binary prevalence vector built from the average posterior
        """
        standardized = instances[:, self.posteriors_column]
        # undo the z-scoring so that the values are posterior probabilities again
        posteriors = zscore_inv(standardized, self.mean, self.scale)
        average_posterior = np.mean(posteriors)
        print('Model', average_posterior)
        return F.as_binary_prevalence(average_posterior)
|
|
|
|
|
|
def zscore_inv(X, mean, scale):
    """
    Inverts a z-score standardization.

    :param X: standardized value(s)
    :param mean: the mean that was subtracted when standardizing
    :param scale: the scale by which the values were divided when standardizing
    :return: the value(s) in the original units, i.e., `X * scale + mean`
    """
    rescaled = X * scale
    return rescaled + mean
|
|
|
|
|
|
class StatModelLR(BaseQuantifier):
    """
    Wrapper that recalibrates the column "prob" by means of a Logistic Regression, and reports the
    average of the recalibrated posteriors as the predicted prevalence.
    The column "prob" is pre-computed by a different model used by our statisticians.

    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
        it is the last column both in survey_y.csv and in cens_y.csv
    :param mean: value added back to the column (after multiplying by `scale`) in order to undo a
        previous standardization; the default (0) together with `scale=1` leaves the column as is
    :param scale: value the column is multiplied by in order to undo a previous standardization;
        the default (1) together with `mean=0` leaves the column as is
    """

    def __init__(self, posteriors_column=-1, mean=0, scale=1):
        self.posteriors_column = posteriors_column
        self.mean = mean
        self.scale = scale
        self.lr = LogisticRegressionCV()

    def _posteriors_as_feature(self, instances):
        # extracts the column "prob" as a single-feature matrix and reconverts the z-scored
        # variable to its original status
        column = instances[:, self.posteriors_column].reshape(-1, 1)
        return zscore_inv(column, self.mean, self.scale)

    def fit(self, data: LabelledCollection):
        """
        Trains the calibrator using the column "prob" as the only feature.

        :param data: the training collection; its attributes `X` and `y` are used
        :return: self
        """
        feature = self._posteriors_as_feature(data.X)
        self.lr.fit(feature, data.y)
        return self

    def quantify(self, instances):
        """
        Averages the recalibrated posteriors of the instances.

        :param instances: covariate matrix containing the column "prob"
        :return: the binary prevalence vector built from the average recalibrated posterior
        """
        feature = self._posteriors_as_feature(instances)
        # the last column of predict_proba is taken as the recalibrated posterior
        recalibrated = self.lr.predict_proba(feature)[:, -1]
        return F.as_binary_prevalence(np.mean(recalibrated))