diff --git a/Census/adjacentmedian_4.1.py b/Census/adjacentmedian_4.1.py
index 339b509..1b0b9be 100644
--- a/Census/adjacentmedian_4.1.py
+++ b/Census/adjacentmedian_4.1.py
@@ -14,17 +14,21 @@ from copy import deepcopy
 
 np.set_printoptions(linewidth=np.inf)
 
+
 def classifier():
     return LogisticRegressionCV()
 
+
 def quantifiers():
     cls = classifier()
     yield 'MLPE', MLPE()
-    yield 'CC', CC(cls)
+    # yield 'CC', CC(cls)
     yield 'PCC', PCC(cls)
-    yield 'ACC', ACC(cls)
-    yield 'PACC', PACC(cls)
-    yield 'MS', MS(cls)
+    # yield 'ACC', ACC(cls)
+    # yield 'PACC', PACC(cls)
+    # yield 'MS', MS(cls)
+    yield 'SModelLR', StatModelLR()
+    yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
     # yield 'MS2', MS2(cls)
     # yield 'SLD', EMQ(cls)
 
@@ -35,6 +39,7 @@
 Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
 preprocessor = Preprocessor()
 Xtr = preprocessor.fit_transform(Xtr)
+prob_mean, prob_std = preprocessor.get_mean_std(column=-1)  # get the mean and std of the "prob" column
 
 data = get_dataset_by_area(Atr, Xtr, ytr)
 n_areas = len(data)
@@ -58,10 +63,8 @@ for aggr in ['median', 'mean']:
 
     table = Table(name=f'adjacent{aggr}', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
     table.format.mean_prec = 4
     table.format.show_std = False
-    table.format.sta = False
     table.format.remove_zero = True
-
     for q_name, q in quantifiers():
         # pretrain quantifiers per area
         pretrained_area_q = []
diff --git a/Census/allmedian_3.1.py b/Census/allmedian_3.1.py
index 7380f28..bc54555 100644
--- a/Census/allmedian_3.1.py
+++ b/Census/allmedian_3.1.py
@@ -25,6 +25,8 @@ def quantifiers():
     yield 'ACC', ACC(cls)
     yield 'PACC', PACC(cls)
     yield 'SLD', EMQ(cls)
+    yield 'SModelLR', StatModelLR()
+    yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
 
 
 survey_y = './data/survey_y.csv'
@@ -33,6 +35,7 @@
 Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
 preprocessor = Preprocessor()
 Xtr = preprocessor.fit_transform(Xtr)
+prob_mean, prob_std = preprocessor.get_mean_std(column=-1)  # get the mean and std of the "prob" column
 
 data = get_dataset_by_area(Atr, Xtr, ytr)
 n_areas = len(data)
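A wiring note on the scripts above (and on pairwise_2.py further below): quantifiers() is a generator that reads the module-level prob_mean and prob_std only when it is iterated, so the late definition is safe as long as the generator is consumed after preprocessor.get_mean_std(column=-1) has run. The sketch below, with invented toy values, shows what get_mean_std recovers from the fitted StandardScaler and what the zscore_inv helper added to commons.py undoes:

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    prob = np.array([[0.2], [0.5], [0.9]])              # toy stand-in for the "prob" column
    scaler = StandardScaler().fit(prob)
    z = scaler.transform(prob)                          # what the Preprocessor feeds the quantifiers

    mean, scale = scaler.mean_[-1], scaler.scale_[-1]   # what get_mean_std(column=-1) returns
    assert np.allclose(z * scale + mean, prob)          # the inversion performed by zscore_inv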
diff --git a/Census/commons.py b/Census/commons.py
index 3d6118c..ee21be8 100644
--- a/Census/commons.py
+++ b/Census/commons.py
@@ -1,5 +1,10 @@
 import numpy as np
 import pandas as pd
+from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
+
+from quapy.data import LabelledCollection
+from quapy.method.base import BaseQuantifier
+import quapy.functional as F
 from sklearn.preprocessing import StandardScaler
 
 np.set_printoptions(linewidth=np.inf)
@@ -43,7 +48,7 @@ def get_dataset_by_area(A, X, y=None):
 
 class AdjMatrix:
 
-    def __init__(self, path):
+    def __init__(self, path, add_diagonal=False):
         df = pd.read_csv(path)
 
         area_codes = df.columns[1:].values
@@ -54,7 +59,12 @@
         print(values)
         self.area2idx = {area:i for i, area in enumerate(area_codes)}
         self.idx2area = area_codes
-        self.M = np.asarray(values)
+        self.M = np.asarray(values, dtype=int)
+        if add_diagonal:
+            # adding the diagonal has the effect of considering an area to be adjacent to itself. This is useful when
+            # the model is trained using survey_y.csv data and tested using cens_y.csv, but should not be done when
+            # the model is trained and tested on survey_y.csv
+            self.M += np.eye(self.M.shape[0], dtype=int)
 
     def adjacent(self, cod_1, cod_2):
         idx1 = self.area2idx[cod_1]
@@ -87,4 +97,76 @@ class Preprocessor:
 
     def fit_transform(self, X, y=None):
         return self.fit(X, y).transform(X)
 
+    def get_mean_std(self, column):
+        mean = self.scaler.mean_[column]
+        std = self.scaler.scale_[column]
+        return mean, std
+
+class StatModel(BaseQuantifier):
+    """
+    This method is a wrapper that simply returns the expected value of the column "prob" as the prediction.
+    The column "prob" comes from a different model used by our statisticians and is pre-computed, so this
+    method simply reports its average.
+
+    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
+        it is the last column in both survey_y.csv and cens_y.csv
+    :param mean: indicates the mean of the column. If specified, then the column is assumed to be
+        standardized, and the inverse function is applied in order to recover the posterior probability
+        in the range [0,1]
+    :param scale: indicates the scale of the column. If specified, then the column is assumed to be
+        standardized, and the inverse function is applied in order to recover the posterior probability
+        in the range [0,1]
+    """
+    def __init__(self, posteriors_column=-1, mean=0, scale=1):
+        self.posteriors_column = posteriors_column
+        self.mean = mean
+        self.scale = scale
+
+    def fit(self, data: LabelledCollection):
+        return self
+
+    def quantify(self, instances):
+        prob = instances[:, self.posteriors_column]
+        # reconvert the z-scored variable to its original scale
+        prob = zscore_inv(prob, self.mean, self.scale)
+        prob_ave = np.mean(prob)
+        print('Model', prob_ave)
+        prev = F.as_binary_prevalence(prob_ave)
+        return prev
+
+
+def zscore_inv(X, mean, scale):
+    return X*scale + mean
+
+
+class StatModelLR(BaseQuantifier):
+    """
+    This method is a wrapper that recalibrates the column "prob" via Logistic Regression.
+    The column "prob" comes from a different model used by our statisticians and is pre-computed.
+
+    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
+        it is the last column in both survey_y.csv and cens_y.csv
+    """
+    def __init__(self, posteriors_column=-1, mean=0, scale=1):
+        self.posteriors_column = posteriors_column
+        self.mean = mean
+        self.scale = scale
+        self.lr = LogisticRegressionCV()
+
+    def fit(self, data: LabelledCollection):
+        X = data.X[:, self.posteriors_column].reshape(-1, 1)
+        # reconvert the z-scored variable to its original scale
+        X = zscore_inv(X, self.mean, self.scale)
+        y = data.y
+        self.lr.fit(X, y)
+        return self
+
+    def quantify(self, instances):
+        prob = instances[:, self.posteriors_column].reshape(-1, 1)
+        # reconvert the z-scored variable to its original scale
+        prob = zscore_inv(prob, self.mean, self.scale)
+        calib_prob = self.lr.predict_proba(prob)[:, -1]
+        prob_ave = np.mean(calib_prob)
+        prev = F.as_binary_prevalence(prob_ave)
+        return prev
\ No newline at end of file
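For reference, a minimal usage sketch of the two quantifiers added to commons.py above; the data shapes and the mean/scale values are invented, and only the last column (standing in for the standardized "prob") matters to these methods:

    import numpy as np
    from quapy.data import LabelledCollection
    from commons import StatModel, StatModelLR   # assuming this runs from the Census/ folder

    rng = np.random.default_rng(0)
    X = rng.normal(size=(1000, 5))               # last column plays the z-scored "prob"
    y = rng.integers(0, 2, size=1000)
    train = LabelledCollection(X, y)

    # in the scripts, mean/scale come from Preprocessor.get_mean_std(column=-1)
    sm = StatModel(mean=0.5, scale=0.1).fit(train)
    print(sm.quantify(X))                        # two class prevalences summing to 1

    smlr = StatModelLR().fit(train)              # recalibrates the column with LogisticRegressionCV
    print(smlr.quantify(X))

Design note: StatModel ignores the training data entirely (fit returns self), so it acts as a fixed baseline, while StatModelLR is the variant that actually learns from the survey labels.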
diff --git a/Census/methods.py b/Census/methods.py
index c67bb61..245984b 100644
--- a/Census/methods.py
+++ b/Census/methods.py
@@ -39,7 +39,6 @@ class CombinationRule(ABC):
 
         return prevalence
 
-
 def optimize_ensemble(area_data: Iterable, q: BaseQuantifier, Madj=None, hyper=None, error='mae'):
     if hyper is None:
         hyper = {
diff --git a/Census/pairwise_2.py b/Census/pairwise_2.py
index e1708dd..db6fe9c 100644
--- a/Census/pairwise_2.py
+++ b/Census/pairwise_2.py
@@ -23,6 +23,8 @@ def quantifiers():
     yield 'ACC', ACC(cls)
     yield 'PACC', PACC(cls)
     yield 'SLD', SLD(cls)
+    yield 'SModelLR', StatModelLR()
+    yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
 
 
 survey_y = './data/survey_y.csv'
@@ -31,6 +33,7 @@
 Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
 preprocessor = Preprocessor()
 Xtr = preprocessor.fit_transform(Xtr)
+prob_mean, prob_std = preprocessor.get_mean_std(column=-1)  # get the mean and std of the "prob" column
 
 trains = get_dataset_by_area(Atr, Xtr, ytr)
 n_areas = len(trains)
diff --git a/quapy/plot.py b/quapy/plot.py
index cdc3bd5..78911ec 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -1,6 +1,6 @@
 from collections import defaultdict
 import matplotlib.pyplot as plt
-from matplotlib.cm import get_cmap
+from matplotlib.pyplot import get_cmap
 import numpy as np
 from matplotlib import cm
 from scipy.stats import ttest_ind_from_stats
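A portability note on the quapy/plot.py change: matplotlib.cm.get_cmap was deprecated in matplotlib 3.7 and removed in 3.9, which is presumably what forced the switch to matplotlib.pyplot.get_cmap. On recent matplotlib versions the registry lookup sidesteps get_cmap entirely; a sketch (check the version actually pinned before relying on it):

    import matplotlib
    cmap = matplotlib.colormaps['tab10']   # ColormapRegistry lookup, no get_cmap involved
    print(cmap(0))                         # first color as an RGBA tuple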