# QuaPy/Census/commons.py

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier
import quapy.functional as F
from sklearn.preprocessing import StandardScaler
np.set_printoptions(linewidth=np.inf)


def load_csv(file, use_yhat=True):
    """
    Loads a csv file (e.g., survey_y.csv or cens_y.csv) and returns the area codes, the covariate
    matrix and, if the ground-truth column 'y.true' is present, also the true labels.
    """
    df = pd.read_csv(file)

    cod_area = 'cod.prov'
    if use_yhat:
        covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'y.hat', 'prob']
    else:
        covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'prob']
    y_true = 'y.true'

    X = df[covariates].values
    A = df[cod_area].values

    for i, cov in enumerate(covariates):
        print(f'values of col {i} "{cov}" {np.unique(X[:, i])}')

    if y_true in df.columns:
        y = df[y_true].values
        return A, X, y
    else:
        return A, X


def get_dataset_by_area(A, X, y=None):
    """
    Splits the data into a list of (area, X, y) tuples, one per unique area code in A.
    If y is None (unlabelled data), the third element of each tuple is None.
    """
    data = []
    for area in np.unique(A):
        sel = (A == area)
        Xsel = X[sel]
        ysel = y[sel] if y is not None else None
        data.append((area, Xsel, ysel))
    return data
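

# Hedged usage sketch: this is an illustration, not part of the original pipeline. It assumes a
# csv with the columns listed in load_csv (survey_y.csv, which carries the 'y.true' column, is
# named in the docstrings below); the function and variable names here are ours.
def _demo_load_and_split():
    A, X, y = load_csv('survey_y.csv', use_yhat=True)
    for area, Xa, ya in get_dataset_by_area(A, X, y):
        print(f'area {area}: {Xa.shape[0]} instances, prevalence={ya.mean():.4f}')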


class AdjMatrix:
    """
    Loads and provides access to the adjacency matrix between areas. The csv is expected to have
    the area codes as column names (first column excluded) and the 0/1 adjacency values as rows.
    """

    def __init__(self, path, add_diagonal=False):
        df = pd.read_csv(path)

        area_codes = df.columns[1:].values
        area_codes = np.asarray([int(c) for c in area_codes])

        values = df.values[:, 1:]
        print(area_codes)
        print(values)

        self.area2idx = {area: i for i, area in enumerate(area_codes)}
        self.idx2area = area_codes
        self.M = np.asarray(values, dtype=int)
        if add_diagonal:
            # adding the diagonal has the effect of considering an area to be adjacent to itself. This is
            # useful when the model is trained using survey_y.csv data and tested using cens_y.csv, but
            # should not be done when the model is trained and tested on survey_y.csv
            self.M += np.eye(self.M.shape[0], dtype=int)

    def adjacent(self, cod_1, cod_2):
        idx1 = self.area2idx[cod_1]
        idx2 = self.area2idx[cod_2]
        return self.M[idx1, idx2] == 1

    def get_adjacent(self, cod):
        idx = self.area2idx[cod]
        idx_adj = np.argwhere(self.M[idx] == 1).flatten()
        return self.idx2area[idx_adj]
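

# Hedged usage sketch: 'adjacency.csv' is a hypothetical filename standing in for the actual
# adjacency-matrix csv; only the AdjMatrix API exercised below comes from this module.
def _demo_adjacency():
    adj = AdjMatrix('adjacency.csv', add_diagonal=False)
    some_area = adj.idx2area[0]
    print(f'areas adjacent to {some_area}: {adj.get_adjacent(some_area)}')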


class Preprocessor:
    """
    Standardizes (z-scores) the covariate columns indicated in standardize_col_ids.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        # self.standardize_col_ids = np.asarray([1, 4, 5])  # eta, year_edu, hsize
        self.standardize_col_ids = np.arange(8)  # everything

    def fit(self, X, y=None):
        Xsel = X[:, self.standardize_col_ids]
        self.scaler.fit(Xsel)
        return self

    def transform(self, X):
        X = X.copy()  # avoid modifying the caller's array in place
        Xsel = X[:, self.standardize_col_ids]
        Xsel_zscore = self.scaler.transform(Xsel)
        X[:, self.standardize_col_ids] = Xsel_zscore
        return X

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

    def get_mean_std(self, column):
        mean = self.scaler.mean_[column]
        std = self.scaler.scale_[column]
        return mean, std
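

# Hedged usage sketch: since the Preprocessor z-scores every column (including 'prob', the last
# one), the mean and scale it learns for that column can be recovered via get_mean_std and handed
# to StatModel / StatModelLR below so they can undo the standardization; variable names are ours.
def _demo_preprocessing():
    A, X, y = load_csv('survey_y.csv', use_yhat=True)
    prep = Preprocessor()
    X = prep.fit_transform(X)
    prob_mean, prob_std = prep.get_mean_std(column=-1)
    print(f'column "prob": mean={prob_mean:.4f}, std={prob_std:.4f}')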


class StatModel(BaseQuantifier):
    """
    This method is a wrapper that simply returns the expected value of column "prob" as the prediction.
    The column "prob" comes from a different model used by our statisticians and is pre-computed, so this
    method actually simply reports the average.

    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
        it is the last column in both survey_y.csv and cens_y.csv
    :param mean: indicates the mean of the column. If specified, then the column is assumed to be
        standardized, and the inverse function is applied in order to recover the posterior probability
        in the range [0,1]
    :param scale: indicates the scale of the column. If specified, then the column is assumed to be
        standardized, and the inverse function is applied in order to recover the posterior probability
        in the range [0,1]
    """

    def __init__(self, posteriors_column=-1, mean=0, scale=1):
        self.posteriors_column = posteriors_column
        self.mean = mean
        self.scale = scale

    def fit(self, data: LabelledCollection):
        return self

    def quantify(self, instances):
        prob = instances[:, self.posteriors_column]
        # reconvert the z-scored variable to its original status
        prob = zscore_inv(prob, self.mean, self.scale)
        prob_ave = np.mean(prob)
        print('Model', prob_ave)
        prev = F.as_binary_prevalence(prob_ave)
        return prev


def zscore_inv(X, mean, scale):
    """Inverts the z-score transformation: X*scale + mean recovers the original values."""
    return X * scale + mean
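

# Hedged usage sketch: StatModel needs no training (fit is a no-op); mean and scale would come
# from Preprocessor.get_mean_std(-1) if the data was standardized. The parameter names below are
# illustrative, not part of the original pipeline.
def _demo_statmodel(X_test, prob_mean, prob_std):
    model = StatModel(posteriors_column=-1, mean=prob_mean, scale=prob_std)
    prevalence = model.quantify(X_test)  # [Pr(negative), Pr(positive)]
    print(f'estimated prevalence: {prevalence}')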


class StatModelLR(BaseQuantifier):
    """
    This method is a wrapper that recalibrates the column "prob" via Logistic Regression.
    The column "prob" comes from a different model used by our statisticians and is pre-computed.

    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
        it is the last column in both survey_y.csv and cens_y.csv
    :param mean: indicates the mean of the column; if specified, the column is assumed to be
        standardized and is converted back to its original range before recalibration
    :param scale: indicates the scale of the column; if specified, the column is assumed to be
        standardized and is converted back to its original range before recalibration
    """

    def __init__(self, posteriors_column=-1, mean=0, scale=1):
        self.posteriors_column = posteriors_column
        self.mean = mean
        self.scale = scale
        self.lr = LogisticRegressionCV()

    def fit(self, data: LabelledCollection):
        X = data.X[:, self.posteriors_column].reshape(-1, 1)
        # reconvert the z-scored variable to its original status
        X = zscore_inv(X, self.mean, self.scale)
        y = data.y
        self.lr.fit(X, y)
        return self

    def quantify(self, instances):
        prob = instances[:, self.posteriors_column].reshape(-1, 1)
        # reconvert the z-scored variable to its original status
        prob = zscore_inv(prob, self.mean, self.scale)
        calib_prob = self.lr.predict_proba(prob)[:, -1]
        prob_ave = np.mean(calib_prob)
        prev = F.as_binary_prevalence(prob_ave)
        return prev
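

# Hedged usage sketch: unlike StatModel, StatModelLR does require fitting. Wrapping the training
# data in a LabelledCollection follows QuaPy's standard quantifier API; the train/test variable
# names below are illustrative.
def _demo_statmodel_lr(X_train, y_train, X_test, prob_mean, prob_std):
    train = LabelledCollection(X_train, y_train)
    model = StatModelLR(posteriors_column=-1, mean=prob_mean, scale=prob_std)
    model.fit(train)
    prevalence = model.quantify(X_test)  # [Pr(negative), Pr(positive)]
    print(f'estimated prevalence: {prevalence}')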