last experiments before the meeting with unipi

This commit is contained in:
Alejandro Moreo Fernandez 2024-06-18 11:14:40 +02:00
parent 21d052313c
commit 4f3a6a4169
6 changed files with 100 additions and 10 deletions

View File

@ -14,17 +14,21 @@ from copy import deepcopy
np.set_printoptions(linewidth=np.inf)
def classifier():
return LogisticRegressionCV()
def quantifiers():
cls = classifier()
yield 'MLPE', MLPE()
yield 'CC', CC(cls)
# yield 'CC', CC(cls)
yield 'PCC', PCC(cls)
yield 'ACC', ACC(cls)
yield 'PACC', PACC(cls)
yield 'MS', MS(cls)
# yield 'ACC', ACC(cls)
# yield 'PACC', PACC(cls)
# yield 'MS', MS(cls)
yield 'SModelLR', StatModelLR()
yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
# yield 'MS2', MS2(cls)
# yield 'SLD', EMQ(cls)
@ -35,6 +39,7 @@ Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
# standardize the training covariates with a freshly fitted preprocessor
preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)
# prob_mean and prob_std are module-level names read by quantifiers() when it instantiates StatModel
prob_mean, prob_std = preprocessor.get_mean_std(column=-1) # get the mean and std of the "prob" column
data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)
@ -58,10 +63,8 @@ for aggr in ['median', 'mean']:
table = Table(name=f'adjacent{aggr}', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
table.format.mean_prec = 4
table.format.show_std = False
table.format.sta = False
table.format.remove_zero = True
for q_name, q in quantifiers():
# pretrain quantifiers per area
pretrained_area_q = []

View File

@ -25,6 +25,8 @@ def quantifiers():
yield 'ACC', ACC(cls)
yield 'PACC', PACC(cls)
yield 'SLD', EMQ(cls)
yield 'SModelLR', StatModelLR()
yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
survey_y = './data/survey_y.csv'
@ -33,6 +35,7 @@ Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
# standardize the training covariates with a freshly fitted preprocessor
preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)
# prob_mean and prob_std are module-level names read by quantifiers() when it instantiates StatModel
prob_mean, prob_std = preprocessor.get_mean_std(column=-1) # get the mean and std of the "prob" column
data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

View File

@ -1,5 +1,10 @@
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier
import quapy.functional as F
from sklearn.preprocessing import StandardScaler
np.set_printoptions(linewidth=np.inf)
@ -43,7 +48,7 @@ def get_dataset_by_area(A, X, y=None):
class AdjMatrix:
def __init__(self, path):
def __init__(self, path, add_diagonal=False):
df = pd.read_csv(path)
area_codes = df.columns[1:].values
@ -54,7 +59,12 @@ class AdjMatrix:
print(values)
self.area2idx = {area:i for i, area in enumerate(area_codes)}
self.idx2area = area_codes
self.M = np.asarray(values)
self.M = np.asarray(values, dtype=int)
if add_diagonal:
# adding the diagonal has the effect of treating every area as adjacent to itself. This is useful when
# the model is trained using survey_y.csv data and tested using cens_y.csv, but should not be done when
# the model is trained and tested on survey_y.csv
self.M += np.eye(self.M.shape[0], dtype=int)
def adjacent(self, cod_1, cod_2):
idx1 = self.area2idx[cod_1]
@ -87,4 +97,76 @@ class Preprocessor:
def fit_transform(self, X, y=None):
return self.fit(X, y).transform(X)
def get_mean_std(self, column):
    """
    Looks up the statistics that the fitted scaler holds for one column.

    :param column: index used to address `self.scaler.mean_` and `self.scaler.scale_`
    :return: a tuple `(mean, std)` with the entries found at `column` in those two arrays
    """
    return self.scaler.mean_[column], self.scaler.scale_[column]
class StatModel(BaseQuantifier):
    """
    Wrapper that reports the average of the column "prob" as the predicted positive prevalence.
    The column "prob" is pre-computed by a different model (the one used by our statisticians), so
    nothing is learned here: the method only averages the values it receives.

    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
        it is the last column in both survey_y.csv and cens_y.csv
    :param mean: the mean of the column. If specified, the column is assumed to be standardized and
        the z-score is inverted in order to recover the posterior probability in the range [0,1]
    :param scale: the scale of the column. If specified, the column is assumed to be standardized and
        the z-score is inverted in order to recover the posterior probability in the range [0,1]
    """

    def __init__(self, posteriors_column=-1, mean=0, scale=1):
        self.scale = scale
        self.mean = mean
        self.posteriors_column = posteriors_column

    def fit(self, data: LabelledCollection):
        # there is nothing to train: the posteriors come pre-computed in the data
        return self

    def quantify(self, instances):
        standardized = instances[:, self.posteriors_column]
        # map the z-scored column back to its original values before averaging
        posteriors = zscore_inv(standardized, self.mean, self.scale)
        prob_ave = np.mean(posteriors)
        print('Model', prob_ave)
        return F.as_binary_prevalence(prob_ave)
def zscore_inv(X, mean, scale):
    """
    Inverts a z-score standardization.

    :param X: the standardized value(s)
    :param mean: the mean that was subtracted when standardizing
    :param scale: the scale the values were divided by when standardizing
    :return: `X * scale + mean`
    """
    rescaled = X * scale
    return rescaled + mean
class StatModelLR(BaseQuantifier):
    """
    Wrapper that recalibrates the column "prob" via Logistic Regression and reports the average of
    the recalibrated posteriors as the predicted positive prevalence.
    The column "prob" is pre-computed by a different model (the one used by our statisticians).

    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
        it is the last column in both survey_y.csv and cens_y.csv
    :param mean: the mean of the column; used, together with `scale`, to invert the z-score of the
        column before it is given to the logistic regressor (the default 0 applies no shift)
    :param scale: the scale of the column; used, together with `mean`, to invert the z-score of the
        column before it is given to the logistic regressor (the default 1 applies no rescaling)
    """

    def __init__(self, posteriors_column=-1, mean=0, scale=1):
        self.posteriors_column = posteriors_column
        self.mean = mean
        self.scale = scale
        self.lr = LogisticRegressionCV()

    def _prob_feature(self, X):
        # isolates the column "prob" as a single-feature matrix and reconverts the z-scored
        # variable to its original status
        column = X[:, self.posteriors_column].reshape(-1, 1)
        return zscore_inv(column, self.mean, self.scale)

    def fit(self, data: LabelledCollection):
        feature = self._prob_feature(data.X)
        self.lr.fit(feature, data.y)
        return self

    def quantify(self, instances):
        feature = self._prob_feature(instances)
        # the last column of predict_proba holds the recalibrated posterior that gets averaged
        recalibrated = self.lr.predict_proba(feature)[:, -1]
        return F.as_binary_prevalence(np.mean(recalibrated))

View File

@ -39,7 +39,6 @@ class CombinationRule(ABC):
return prevalence
def optimize_ensemble(area_data: Iterable, q: BaseQuantifier, Madj=None, hyper=None, error='mae'):
if hyper is None:
hyper = {

View File

@ -23,6 +23,8 @@ def quantifiers():
yield 'ACC', ACC(cls)
yield 'PACC', PACC(cls)
yield 'SLD', SLD(cls)
yield 'SModelLR', StatModelLR()
yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
survey_y = './data/survey_y.csv'
@ -31,6 +33,7 @@ Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
# standardize the training covariates with a freshly fitted preprocessor
preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)
# prob_mean and prob_std are module-level names read by quantifiers() when it instantiates StatModel
prob_mean, prob_std = preprocessor.get_mean_std(column=-1) # get the mean and std of the "prob" column
trains = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(trains)

View File

@ -1,6 +1,6 @@
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from matplotlib.pyplot import get_cmap
import numpy as np
from matplotlib import cm
from scipy.stats import ttest_ind_from_stats