import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier
import quapy.functional as F
from sklearn.preprocessing import StandardScaler


np.set_printoptions(linewidth=np.inf)


def load_csv(file, use_yhat=True):
    """
    Loads a csv file and extracts the area codes, the covariates and, if available, the true labels.

    :param file: path to the csv file (anything accepted by `pd.read_csv`)
    :param use_yhat: whether to include column "y.hat" among the covariates. When True, the covariate
        matrix has 8 columns; when False it has 7. In both cases column "prob" is the last one.
    :return: a tuple `(A, X, y)` if the csv contains column "y.true", or a tuple `(A, X)` otherwise, where
        `A` holds the values of column "cod.prov", `X` is the covariate matrix, and `y` the true labels
    """
    df = pd.read_csv(file)

    cod_area = 'cod.prov'
    y_true = 'y.true'

    # "y.hat" (if requested) goes right before "prob", so that "prob" is always the last column
    covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize']
    if use_yhat:
        covariates.append('y.hat')
    covariates.append('prob')

    X = df[covariates].values
    A = df[cod_area].values

    # diagnostic output: the distinct values found in every covariate
    for i, cov in enumerate(covariates):
        print(f'values of col {i} "{cov}" {np.unique(X[:,i])}')

    if y_true not in df.columns:
        return A, X

    y = df[y_true].values
    return A, X, y


def get_dataset_by_area(A, X, y=None):
    """
    Splits the data into one group per distinct area code.

    :param A: array with the area code of every instance
    :param X: covariate matrix, row-aligned with `A`
    :param y: (optional) labels, row-aligned with `A`
    :return: a list of tuples `(area, X_area, y_area)`, one for each value in `np.unique(A)`;
        `y_area` is None if `y` is None
    """
    data = []
    for area in np.unique(A):
        sel = (A == area)
        ysel = y[sel] if y is not None else None
        data.append((area, X[sel], ysel))
    return data


class AdjMatrix:
    """
    Adjacency matrix between areas, read from a csv file. The first column of the csv is discarded; the
    headers of the remaining columns are parsed as the (integer) area codes, and the remaining values are
    the entries of the matrix, in which a value of 1 indicates adjacency.

    :param path: path to the csv file
    :param add_diagonal: if True, every area is considered adjacent to itself
    """

    def __init__(self, path, add_diagonal=False):
        df = pd.read_csv(path)

        area_codes = df.columns[1:].values
        area_codes = np.asarray([int(c) for c in area_codes])
        values = df.values[:, 1:]
        print(area_codes)
        print(values)

        self.area2idx = {area: i for i, area in enumerate(area_codes)}
        self.idx2area = area_codes
        self.M = np.asarray(values, dtype=int)

        if add_diagonal:
            # adding the diagonal has the effect of considering an area be adjacent to itself. This is
            # useful when the model is trained using survey_y.csv data and tested using cens_y.csv, but
            # should not be done when the model is trained and tested on survey_y.csv
            # The diagonal is set to 1 (instead of incremented) so that a matrix that already has ones in
            # its diagonal does not end up with 2's, which the "== 1" tests below would not recognize.
            np.fill_diagonal(self.M, 1)

    def adjacent(self, cod_1, cod_2):
        """
        Tells whether two areas are adjacent.

        :param cod_1: code of the first area
        :param cod_2: code of the second area
        :return: True if the entry of the matrix for (cod_1, cod_2) is 1, False otherwise
        :raises KeyError: if any of the codes is unknown
        """
        idx1 = self.area2idx[cod_1]
        idx2 = self.area2idx[cod_2]
        return (self.M[idx1, idx2] == 1)

    def get_adjacent(self, cod):
        """
        Returns the codes of all areas adjacent to a given one.

        :param cod: code of the area
        :return: array with the codes of the areas whose entry in the row of `cod` is 1
        :raises KeyError: if the code is unknown
        """
        idx = self.area2idx[cod]
        idx_adj = np.flatnonzero(self.M[idx] == 1)
        return self.idx2area[idx_adj]


class Preprocessor:
    """
    Standardizes (z-scores) a selection of columns of the covariate matrix by means of a `StandardScaler`.

    :param standardize_col_ids: indexes of the columns to standardize, e.g., [1, 4, 5] for
        eta, year_edu, hsize. If None (default), all the columns of the matrix passed to `fit` are
        standardized, whatever their number is.
    """

    def __init__(self, standardize_col_ids=None):
        self.scaler = StandardScaler()
        self._all_columns = standardize_col_ids is None
        if standardize_col_ids is not None:
            standardize_col_ids = np.asarray(standardize_col_ids)
        # when all columns are requested, the indexes are only known once `fit` sees the matrix
        self.standardize_col_ids = standardize_col_ids

    def fit(self, X, y=None):
        """
        Fits the scaler on the selected columns.

        :param X: covariate matrix
        :param y: ignored
        :return: self
        """
        if self._all_columns:
            self.standardize_col_ids = np.arange(X.shape[1])
        Xsel = X[:, self.standardize_col_ids]
        self.scaler.fit(Xsel)
        return self

    def transform(self, X):
        """
        Replaces the selected columns with their standardized values. The operation is carried out
        in-place, i.e., the matrix received as argument is modified. Note that, if `X` has an integer
        dtype, the assignment truncates the z-scores; `X` is thus expected to be a float matrix.

        :param X: covariate matrix
        :return: the same matrix `X`, with the selected columns standardized
        """
        Xsel = X[:, self.standardize_col_ids]
        X[:, self.standardize_col_ids] = self.scaler.transform(Xsel)
        return X

    def fit_transform(self, X, y=None):
        """
        Fits the scaler on `X` and then standardizes `X` (in-place, see `transform`).

        :param X: covariate matrix
        :param y: ignored
        :return: the same matrix `X`, with the selected columns standardized
        """
        return self.fit(X, y).transform(X)

    def get_mean_std(self, column):
        """
        Returns the mean and the scale learned by the scaler for one column.

        :param column: position of the column within the standardized columns (this coincides with the
            index of the column in the matrix when all columns are standardized)
        :return: a tuple `(mean, std)`
        """
        mean = self.scaler.mean_[column]
        std = self.scaler.scale_[column]
        return mean, std


class StatModel(BaseQuantifier):
    """
    This method is a wrapper that simply returns the expected value of column "prob" as the prediction.
    The column "prob" comes from a different model used by our statisticians and is pre-computed, so this
    method actually simply reports the average.

    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since it is
        the last column in both survey_y.csv and cens_y.csv
    :param mean: indicates the mean of the column. If specified, then the column is assumed to be
        standardized, and the inverse function is applied in order to recover the posterior probability
        in the range [0,1]
    :param scale: indicates the scale of the column. If specified, then the column is assumed to be
        standardized, and the inverse function is applied in order to recover the posterior probability
        in the range [0,1]
    """

    def __init__(self, posteriors_column=-1, mean=0, scale=1):
        self.posteriors_column = posteriors_column
        self.mean = mean
        self.scale = scale

    def fit(self, data: LabelledCollection):
        """
        Does nothing, since the posterior probabilities are pre-computed.

        :param data: ignored
        :return: self
        """
        return self

    def quantify(self, instances):
        """
        Averages the (de-standardized) column "prob" and converts the average via
        `F.as_binary_prevalence`.

        :param instances: covariate matrix containing the column "prob"
        :return: the value returned by `F.as_binary_prevalence` for the average of column "prob"
        """
        prob = instances[:, self.posteriors_column]
        # reconvert the z-scored variable to its original status
        prob = zscore_inv(prob, self.mean, self.scale)
        prob_ave = np.mean(prob)
        print('Model', prob_ave)
        prev = F.as_binary_prevalence(prob_ave)
        return prev


def zscore_inv(X, mean, scale):
    """
    Inverts a z-score standardization.

    :param X: standardized values
    :param mean: the mean that was subtracted
    :param scale: the scale by which the values were divided
    :return: `X*scale + mean`
    """
    return X*scale + mean


class StatModelLR(BaseQuantifier):
    """
    This method is a wrapper that recalibrates the column "prob" via Logistic Regression.
    The column "prob" comes from a different model used by our statisticians and is pre-computed.

    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since it is
        the last column in both survey_y.csv and cens_y.csv
    :param mean: indicates the mean of the column. If specified, then the column is assumed to be
        standardized, and the inverse function is applied before the column is used
    :param scale: indicates the scale of the column. If specified, then the column is assumed to be
        standardized, and the inverse function is applied before the column is used
    """

    def __init__(self, posteriors_column=-1, mean=0, scale=1):
        self.posteriors_column = posteriors_column
        self.mean = mean
        self.scale = scale
        self.lr = LogisticRegressionCV()

    def fit(self, data: LabelledCollection):
        """
        Fits the logistic regressor using the (de-standardized) column "prob" as the only feature.

        :param data: the training collection; `data.X` and `data.y` are used
        :return: self
        """
        X = data.X[:, self.posteriors_column].reshape(-1, 1)
        # reconvert the z-scored variable to its original status
        X = zscore_inv(X, self.mean, self.scale)
        y = data.y
        self.lr.fit(X, y)
        return self

    def quantify(self, instances):
        """
        Recalibrates the (de-standardized) column "prob" with the logistic regressor, averages the
        probabilities of the last class, and converts the average via `F.as_binary_prevalence`.

        :param instances: covariate matrix containing the column "prob"
        :return: the value returned by `F.as_binary_prevalence` for the average recalibrated probability
        """
        prob = instances[:, self.posteriors_column].reshape(-1, 1)
        # reconvert the z-scored variable to its original status
        prob = zscore_inv(prob, self.mean, self.scale)
        calib_prob = self.lr.predict_proba(prob)[:, -1]
        prob_ave = np.mean(calib_prob)
        prev = F.as_binary_prevalence(prob_ave)
        return prev