# QuaPy/Census/commons.py

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier
import quapy.functional as F
from sklearn.preprocessing import StandardScaler
np.set_printoptions(linewidth=np.inf)


def load_csv(file, use_yhat=True):
    """
    Loads a csv file (e.g., survey_y.csv or cens_y.csv) and returns the area codes, the covariate
    matrix and, if the ground-truth column 'y.true' is present, also the true labels.
    """
    df = pd.read_csv(file)

    cod_area = 'cod.prov'
    if use_yhat:
        covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'y.hat', 'prob']
    else:
        covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'prob']
    y_true = 'y.true'

    X = df[covariates].values
    A = df[cod_area].values

    for i, cov in enumerate(covariates):
        print(f'values of col {i} "{cov}" {np.unique(X[:, i])}')

    if y_true in df.columns:
        y = df[y_true].values
        return A, X, y
    else:
        return A, X


def get_dataset_by_area(A, X, y=None):
    """
    Splits the data into a list of (area, X, y) tuples, one per unique area code in A.
    If y is None (unlabelled data), the third element of each tuple is None.
    """
    data = []
    for area in np.unique(A):
        sel = (A == area)
        Xsel = X[sel]
        ysel = y[sel] if y is not None else None
        data.append((area, Xsel, ysel))
    return data
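

# Hedged usage sketch: this is an illustration, not part of the original pipeline. It assumes a
# csv with the columns listed in load_csv (survey_y.csv, which carries the 'y.true' column, is
# named in the docstrings below); the function and variable names here are ours.
def _demo_load_and_split():
    A, X, y = load_csv('survey_y.csv', use_yhat=True)
    for area, Xa, ya in get_dataset_by_area(A, X, y):
        print(f'area {area}: {Xa.shape[0]} instances, prevalence={ya.mean():.4f}')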


class AdjMatrix:
    """
    Loads and provides access to the adjacency matrix between areas. The csv is expected to have
    the area codes as column names (first column excluded) and the 0/1 adjacency values as rows.
    """

    def __init__(self, path, add_diagonal=False):
        df = pd.read_csv(path)

        area_codes = df.columns[1:].values
        area_codes = np.asarray([int(c) for c in area_codes])

        values = df.values[:, 1:]
        print(area_codes)
        print(values)

        self.area2idx = {area: i for i, area in enumerate(area_codes)}
        self.idx2area = area_codes
        self.M = np.asarray(values, dtype=int)
        if add_diagonal:
            # adding the diagonal has the effect of considering an area to be adjacent to itself. This is
            # useful when the model is trained using survey_y.csv data and tested using cens_y.csv, but
            # should not be done when the model is trained and tested on survey_y.csv
            self.M += np.eye(self.M.shape[0], dtype=int)

    def adjacent(self, cod_1, cod_2):
        idx1 = self.area2idx[cod_1]
        idx2 = self.area2idx[cod_2]
        return self.M[idx1, idx2] == 1

    def get_adjacent(self, cod):
        idx = self.area2idx[cod]
        idx_adj = np.argwhere(self.M[idx] == 1).flatten()
        return self.idx2area[idx_adj]
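

# Hedged usage sketch: 'adjacency.csv' is a hypothetical filename standing in for the actual
# adjacency-matrix csv; only the AdjMatrix API exercised below comes from this module.
def _demo_adjacency():
    adj = AdjMatrix('adjacency.csv', add_diagonal=False)
    some_area = adj.idx2area[0]
    print(f'areas adjacent to {some_area}: {adj.get_adjacent(some_area)}')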


class Preprocessor:
    """
    Standardizes (z-scores) the covariate columns indicated in standardize_col_ids.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        # self.standardize_col_ids = np.asarray([1, 4, 5])  # eta, year_edu, hsize
        self.standardize_col_ids = np.arange(8)  # everything

    def fit(self, X, y=None):
        Xsel = X[:, self.standardize_col_ids]
        self.scaler.fit(Xsel)
        return self

    def transform(self, X):
        X = X.copy()  # avoid modifying the caller's array in place
        Xsel = X[:, self.standardize_col_ids]
        Xsel_zscore = self.scaler.transform(Xsel)
        X[:, self.standardize_col_ids] = Xsel_zscore
        return X

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

    def get_mean_std(self, column):
        mean = self.scaler.mean_[column]
        std = self.scaler.scale_[column]
        return mean, std
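

# Hedged usage sketch: since the Preprocessor z-scores every column (including 'prob', the last
# one), the mean and scale it learns for that column can be recovered via get_mean_std and handed
# to StatModel / StatModelLR below so they can undo the standardization; variable names are ours.
def _demo_preprocessing():
    A, X, y = load_csv('survey_y.csv', use_yhat=True)
    prep = Preprocessor()
    X = prep.fit_transform(X)
    prob_mean, prob_std = prep.get_mean_std(column=-1)
    print(f'column "prob": mean={prob_mean:.4f}, std={prob_std:.4f}')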


class StatModel(BaseQuantifier):
    """
    This method is a wrapper that simply returns the expected value of column "prob" as the prediction.
    The column "prob" comes from a different model used by our statisticians and is pre-computed, so this
    method actually simply reports the average.

    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
        it is the last column in both survey_y.csv and cens_y.csv
    :param mean: indicates the mean of the column. If specified, then the column is assumed to be
        standardized, and the inverse function is applied in order to recover the posterior probability
        in the range [0,1]
    :param scale: indicates the scale of the column. If specified, then the column is assumed to be
        standardized, and the inverse function is applied in order to recover the posterior probability
        in the range [0,1]
    """

    def __init__(self, posteriors_column=-1, mean=0, scale=1):
        self.posteriors_column = posteriors_column
        self.mean = mean
        self.scale = scale

    def fit(self, data: LabelledCollection):
        return self

    def quantify(self, instances):
        prob = instances[:, self.posteriors_column]
        # reconvert the z-scored variable to its original status
        prob = zscore_inv(prob, self.mean, self.scale)
        prob_ave = np.mean(prob)
        print('Model', prob_ave)
        prev = F.as_binary_prevalence(prob_ave)
        return prev


def zscore_inv(X, mean, scale):
    """Inverts the z-score transformation: X*scale + mean recovers the original values."""
    return X * scale + mean
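

# Hedged usage sketch: StatModel needs no training (fit is a no-op); mean and scale would come
# from Preprocessor.get_mean_std(-1) if the data was standardized. The parameter names below are
# illustrative, not part of the original pipeline.
def _demo_statmodel(X_test, prob_mean, prob_std):
    model = StatModel(posteriors_column=-1, mean=prob_mean, scale=prob_std)
    prevalence = model.quantify(X_test)  # [Pr(negative), Pr(positive)]
    print(f'estimated prevalence: {prevalence}')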


class StatModelLR(BaseQuantifier):
    """
    This method is a wrapper that recalibrates the column "prob" via Logistic Regression.
    The column "prob" comes from a different model used by our statisticians and is pre-computed.

    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
        it is the last column in both survey_y.csv and cens_y.csv
    :param mean: indicates the mean of the column; if specified, the column is assumed to be
        standardized and is converted back to its original range before recalibration
    :param scale: indicates the scale of the column; if specified, the column is assumed to be
        standardized and is converted back to its original range before recalibration
    """

    def __init__(self, posteriors_column=-1, mean=0, scale=1):
        self.posteriors_column = posteriors_column
        self.mean = mean
        self.scale = scale
        self.lr = LogisticRegressionCV()

    def fit(self, data: LabelledCollection):
        X = data.X[:, self.posteriors_column].reshape(-1, 1)
        # reconvert the z-scored variable to its original status
        X = zscore_inv(X, self.mean, self.scale)
        y = data.y
        self.lr.fit(X, y)
        return self

    def quantify(self, instances):
        prob = instances[:, self.posteriors_column].reshape(-1, 1)
        # reconvert the z-scored variable to its original status
        prob = zscore_inv(prob, self.mean, self.scale)
        calib_prob = self.lr.predict_proba(prob)[:, -1]
        prob_ave = np.mean(calib_prob)
        prev = F.as_binary_prevalence(prob_ave)
        return prev
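

# Hedged usage sketch: unlike StatModel, StatModelLR does require fitting. Wrapping the training
# data in a LabelledCollection follows QuaPy's standard quantifier API; the train/test variable
# names below are illustrative.
def _demo_statmodel_lr(X_train, y_train, X_test, prob_mean, prob_std):
    train = LabelledCollection(X_train, y_train)
    model = StatModelLR(posteriors_column=-1, mean=prob_mean, scale=prob_std)
    model.fit(train)
    prevalence = model.quantify(X_test)  # [Pr(negative), Pr(positive)]
    print(f'estimated prevalence: {prevalence}')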