last experiments before the meeting with unipi

2024-06-18 11:14:40 +02:00 · 2024-06-18 11:14:40 +02:00 · 4f3a6a4169
parent 21d052313c
commit 4f3a6a4169
6 changed files with 100 additions and 10 deletions
--- a/Census/adjacentmedian_4.1.py
+++ b/Census/adjacentmedian_4.1.py
@ -14,17 +14,21 @@ from copy import deepcopy

 np.set_printoptions(linewidth=np.inf)

+
 def classifier():
    return LogisticRegressionCV()

+
 def quantifiers():
    cls = classifier()
    yield 'MLPE', MLPE()
-    yield 'CC', CC(cls)
+    # yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
-    yield 'ACC', ACC(cls)
-    yield 'PACC', PACC(cls)
-    yield 'MS', MS(cls)
+    # yield 'ACC', ACC(cls)
+    # yield 'PACC', PACC(cls)
+    # yield 'MS', MS(cls)
+    yield 'SModelLR', StatModelLR()
+    yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
    # yield 'MS2', MS2(cls)
    # yield 'SLD', EMQ(cls)

@ -35,6 +39,7 @@ Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

 preprocessor = Preprocessor()
 Xtr = preprocessor.fit_transform(Xtr)
+prob_mean, prob_std = preprocessor.get_mean_std(column=-1)  # get the mean and std of the "prob" column

 data = get_dataset_by_area(Atr, Xtr, ytr)
 n_areas = len(data)
@ -58,10 +63,8 @@ for aggr in ['median', 'mean']:
    table = Table(name=f'adjacent{aggr}', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
    table.format.mean_prec = 4
    table.format.show_std = False
-    table.format.sta = False
    table.format.remove_zero = True

-
    for q_name, q in quantifiers():
        # pretrain quantifiers per area
        pretrained_area_q = []
--- a/Census/allmedian_3.1.py
+++ b/Census/allmedian_3.1.py
@ -25,6 +25,8 @@ def quantifiers():
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'SLD', EMQ(cls)
+    yield 'SModelLR', StatModelLR()
+    yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)


 survey_y = './data/survey_y.csv'
@ -33,6 +35,7 @@ Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

 preprocessor = Preprocessor()
 Xtr = preprocessor.fit_transform(Xtr)
+prob_mean, prob_std = preprocessor.get_mean_std(column=-1)  # get the mean and std of the "prob" column

 data = get_dataset_by_area(Atr, Xtr, ytr)
 n_areas = len(data)
--- a/Census/commons.py
+++ b/Census/commons.py
@ -1,5 +1,10 @@
 import numpy as np
 import pandas as pd
+from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
+
+from quapy.data import LabelledCollection
+from quapy.method.base import BaseQuantifier
+import quapy.functional as F
 from sklearn.preprocessing import StandardScaler

 np.set_printoptions(linewidth=np.inf)
@ -43,7 +48,7 @@ def get_dataset_by_area(A, X, y=None):

 class AdjMatrix:

-    def __init__(self, path):
+    def __init__(self, path, add_diagonal=False):
        df = pd.read_csv(path)

        area_codes = df.columns[1:].values
@ -54,7 +59,12 @@ class AdjMatrix:
        print(values)
        self.area2idx = {area:i for i, area in enumerate(area_codes)}
        self.idx2area = area_codes
-        self.M = np.asarray(values)
+        self.M = np.asarray(values, dtype=int)
+        if add_diagonal:
+            # adding the diagonal has the effect of treating each area as adjacent to itself. This is useful when
+            # the model is trained using survey_y.csv data and tested using cens_y.csv, but should not be done when
+            # the model is trained and tested on survey_y.csv
+            self.M += np.eye(self.M.shape[0], dtype=int)

    def adjacent(self, cod_1, cod_2):
        idx1 = self.area2idx[cod_1]
@ -87,4 +97,76 @@ class Preprocessor:
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

+    def get_mean_std(self, column):
+        mean = self.scaler.mean_[column]
+        std  = self.scaler.scale_[column]
+        return mean, std

+
+class StatModel(BaseQuantifier):
+    """
+    This method is a wrapper that simply returns the expected value of column "prob" as the prediction.
+    The column "prob" comes from a different model used by our statisticians and is pre-computed, so this
+    method actually simply reports the average.
+
+    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
+        it is the last column in both survey_y.csv and cens_y.csv
+    :param mean: indicates the mean of the column. If specified, then the column is assumed to be
+        standardized, and the inverse function is applied in order to recover the posterior probability
+        in the range [0,1]
+    :param scale: indicates the scale of the column. If specified, then the column is assumed to be
+        standardized, and the inverse function is applied in order to recover the posterior probability
+        in the range [0,1]
+    """
+    def __init__(self, posteriors_column=-1, mean=0, scale=1):
+        self.posteriors_column = posteriors_column
+        self.mean = mean
+        self.scale = scale
+
+    def fit(self, data: LabelledCollection):
+        return self
+
+    def quantify(self, instances):
+        prob = instances[:, self.posteriors_column]
+        # reconvert the z-scored variable to its original status
+        prob = zscore_inv(prob, self.mean, self.scale)
+        prob_ave = np.mean(prob)
+        print('Model', prob_ave)
+        prev = F.as_binary_prevalence(prob_ave)
+        return prev
+
+
+def zscore_inv(X, mean, scale):
+    return X*scale + mean
+
+
+class StatModelLR(BaseQuantifier):
+    """
+    This method is a wrapper that recalibrates the column "prob" via Logistic Regression.
+    The column "prob" comes from a different model used by our statisticians and is pre-computed.
+
+    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
+        it is the last column in both survey_y.csv and cens_y.csv
+    """
+    def __init__(self, posteriors_column=-1, mean=0, scale=1):
+        self.posteriors_column = posteriors_column
+        self.mean = mean
+        self.scale = scale
+        self.lr = LogisticRegressionCV()
+
+    def fit(self, data: LabelledCollection):
+        X = data.X[:,self.posteriors_column].reshape(-1,1)
+        # reconvert the z-scored variable to its original status
+        X = zscore_inv(X, self.mean, self.scale)
+        y = data.y
+        self.lr.fit(X, y)
+        return self
+
+    def quantify(self, instances):
+        prob = instances[:, self.posteriors_column].reshape(-1,1)
+        # reconvert the z-scored variable to its original status
+        prob = zscore_inv(prob, self.mean, self.scale)
+        calib_prob = self.lr.predict_proba(prob)[:,-1]
+        prob_ave = np.mean(calib_prob)
+        prev = F.as_binary_prevalence(prob_ave)
+        return prev
--- a/Census/methods.py
+++ b/Census/methods.py
@ -39,7 +39,6 @@ class CombinationRule(ABC):
        return prevalence


-
 def optimize_ensemble(area_data: Iterable, q: BaseQuantifier, Madj=None, hyper=None, error='mae'):
    if hyper is None:
        hyper = {
--- a/Census/pairwise_2.py
+++ b/Census/pairwise_2.py
@ -23,6 +23,8 @@ def quantifiers():
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'SLD', SLD(cls)
+    yield 'SModelLR', StatModelLR()
+    yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)


 survey_y = './data/survey_y.csv'
@ -31,6 +33,7 @@ Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

 preprocessor = Preprocessor()
 Xtr = preprocessor.fit_transform(Xtr)
+prob_mean, prob_std = preprocessor.get_mean_std(column=-1)  # get the mean and std of the "prob" column

 trains = get_dataset_by_area(Atr, Xtr, ytr)
 n_areas = len(trains)
--- a/quapy/plot.py
+++ b/quapy/plot.py
@ -1,6 +1,6 @@
 from collections import defaultdict
 import matplotlib.pyplot as plt
-from matplotlib.cm import get_cmap
+from matplotlib.pyplot import get_cmap
 import numpy as np
 from matplotlib import cm
 from scipy.stats import ttest_ind_from_stats