last experiments before the meeting with unipi
commit 4f3a6a4169
parent 21d052313c
@@ -14,17 +14,21 @@ from copy import deepcopy
 
 np.set_printoptions(linewidth=np.inf)
 
 
 def classifier():
     return LogisticRegressionCV()
 
 
 def quantifiers():
     cls = classifier()
     yield 'MLPE', MLPE()
-    yield 'CC', CC(cls)
+    # yield 'CC', CC(cls)
     yield 'PCC', PCC(cls)
-    yield 'ACC', ACC(cls)
-    yield 'PACC', PACC(cls)
-    yield 'MS', MS(cls)
+    # yield 'ACC', ACC(cls)
+    # yield 'PACC', PACC(cls)
+    # yield 'MS', MS(cls)
+    yield 'SModelLR', StatModelLR()
+    yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
+    # yield 'MS2', MS2(cls)
+    # yield 'SLD', EMQ(cls)
 
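For context, a minimal sketch of how these (name, quantifier) pairs are typically consumed downstream; `train` and `test` are assumed to be quapy LabelledCollection objects, not names defined in this diff:

for q_name, q in quantifiers():
    q.fit(train)                       # quapy quantifiers are trained on a LabelledCollection
    estim_prev = q.quantify(test.X)    # estimated class-prevalence vector for the test sample
    print(q_name, estim_prev)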
@@ -35,6 +39,7 @@ Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
 
 preprocessor = Preprocessor()
 Xtr = preprocessor.fit_transform(Xtr)
+prob_mean, prob_std = preprocessor.get_mean_std(column=-1)  # get the mean and std of the "prob" column
 
 data = get_dataset_by_area(Atr, Xtr, ytr)
 n_areas = len(data)
@@ -58,10 +63,8 @@ for aggr in ['median', 'mean']:
     table = Table(name=f'adjacent{aggr}', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
     table.format.mean_prec = 4
     table.format.show_std = False
     table.format.sta = False
     table.format.remove_zero = True
 
 
     for q_name, q in quantifiers():
         # pretrain quantifiers per area
         pretrained_area_q = []
@@ -25,6 +25,8 @@ def quantifiers():
     yield 'ACC', ACC(cls)
     yield 'PACC', PACC(cls)
     yield 'SLD', EMQ(cls)
+    yield 'SModelLR', StatModelLR()
+    yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
 
 
 survey_y = './data/survey_y.csv'
@@ -33,6 +35,7 @@ Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
 
 preprocessor = Preprocessor()
 Xtr = preprocessor.fit_transform(Xtr)
+prob_mean, prob_std = preprocessor.get_mean_std(column=-1)  # get the mean and std of the "prob" column
 
 data = get_dataset_by_area(Atr, Xtr, ytr)
 n_areas = len(data)
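Preprocessor.get_mean_std (defined in a hunk further down) reads per-column statistics off the fitted scaler. A small sketch of the idea, assuming Preprocessor wraps a sklearn StandardScaler, with a toy matrix standing in for the real features:

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.random.rand(100, 5)                 # toy features; the last column plays the role of "prob"
scaler = StandardScaler().fit(X)
prob_mean, prob_std = scaler.mean_[-1], scaler.scale_[-1]
# any z-scored value z of that column maps back to the original scale as z * prob_std + prob_mean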
@@ -1,5 +1,10 @@
 import numpy as np
+import pandas as pd
+from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
 
+from quapy.data import LabelledCollection
+from quapy.method.base import BaseQuantifier
+import quapy.functional as F
 from sklearn.preprocessing import StandardScaler
 
 np.set_printoptions(linewidth=np.inf)
@@ -43,7 +48,7 @@ def get_dataset_by_area(A, X, y=None):
 
 class AdjMatrix:
 
-    def __init__(self, path):
+    def __init__(self, path, add_diagonal=False):
         df = pd.read_csv(path)
 
         area_codes = df.columns[1:].values
@@ -54,7 +59,12 @@ class AdjMatrix:
         print(values)
         self.area2idx = {area: i for i, area in enumerate(area_codes)}
         self.idx2area = area_codes
-        self.M = np.asarray(values)
+        self.M = np.asarray(values, dtype=int)
+        if add_diagonal:
+            # adding the diagonal has the effect of considering an area to be adjacent to itself; this is useful
+            # when the model is trained on survey_y.csv data and tested on cens_y.csv, but should not be done
+            # when the model is trained and tested on survey_y.csv
+            self.M += np.eye(self.M.shape[0], dtype=int)
 
     def adjacent(self, cod_1, cod_2):
         idx1 = self.area2idx[cod_1]
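A toy illustration of what add_diagonal does to the adjacency matrix: adding the identity makes every area count as adjacent to itself (the values here are made up):

import numpy as np

M = np.asarray([[0, 1, 0],
                [1, 0, 1],
                [0, 1, 0]], dtype=int)
M += np.eye(M.shape[0], dtype=int)
# M is now [[1, 1, 0], [1, 1, 1], [0, 1, 1]]: each area is a neighbour of itself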
@@ -87,4 +97,76 @@ class Preprocessor:
     def fit_transform(self, X, y=None):
         return self.fit(X, y).transform(X)
 
+    def get_mean_std(self, column):
+        mean = self.scaler.mean_[column]
+        std = self.scaler.scale_[column]
+        return mean, std
+
+
+class StatModel(BaseQuantifier):
+    """
+    This method is a wrapper that simply returns the expected value of the column "prob" as the prediction.
+    The column "prob" comes from a different model used by our statisticians and is pre-computed, so this
+    method actually simply reports the average.
+
+    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
+        it is the last column in both survey_y.csv and cens_y.csv
+    :param mean: indicates the mean of the column. If specified, then the column is assumed to be
+        standardized, and the inverse function is applied in order to recover the posterior probability
+        in the range [0,1]
+    :param scale: indicates the scale of the column. If specified, then the column is assumed to be
+        standardized, and the inverse function is applied in order to recover the posterior probability
+        in the range [0,1]
+    """
+    def __init__(self, posteriors_column=-1, mean=0, scale=1):
+        self.posteriors_column = posteriors_column
+        self.mean = mean
+        self.scale = scale
+
+    def fit(self, data: LabelledCollection):
+        return self
+
+    def quantify(self, instances):
+        prob = instances[:, self.posteriors_column]
+        # reconvert the z-scored variable to its original scale
+        prob = zscore_inv(prob, self.mean, self.scale)
+        prob_ave = np.mean(prob)
+        print('Model', prob_ave)
+        prev = F.as_binary_prevalence(prob_ave)
+        return prev
+
+
+def zscore_inv(X, mean, scale):
+    return X * scale + mean
+
+
+class StatModelLR(BaseQuantifier):
+    """
+    This method is a wrapper that recalibrates the column "prob" via Logistic Regression.
+    The column "prob" comes from a different model used by our statisticians and is pre-computed.
+
+    :param posteriors_column: index of the column "prob" in the csv. The default value is -1 since
+        it is the last column in both survey_y.csv and cens_y.csv
+    """
+    def __init__(self, posteriors_column=-1, mean=0, scale=1):
+        self.posteriors_column = posteriors_column
+        self.mean = mean
+        self.scale = scale
+        self.lr = LogisticRegressionCV()
+
+    def fit(self, data: LabelledCollection):
+        X = data.X[:, self.posteriors_column].reshape(-1, 1)
+        # reconvert the z-scored variable to its original scale
+        X = zscore_inv(X, self.mean, self.scale)
+        y = data.y
+        self.lr.fit(X, y)
+        return self
+
+    def quantify(self, instances):
+        prob = instances[:, self.posteriors_column].reshape(-1, 1)
+        # reconvert the z-scored variable to its original scale
+        prob = zscore_inv(prob, self.mean, self.scale)
+        calib_prob = self.lr.predict_proba(prob)[:, -1]
+        prob_ave = np.mean(calib_prob)
+        prev = F.as_binary_prevalence(prob_ave)
+        return prev
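A worked sketch of StatModel.quantify on made-up numbers, assuming F.as_binary_prevalence(p) maps a positive-class proportion p to the pair (1-p, p), as its use above suggests:

import numpy as np

prob_z = np.array([-1.2, 0.3, 0.9])        # z-scored "prob" column of a test sample
mean, scale = 0.4, 0.2                     # hypothetical values from get_mean_std(column=-1)
prob = prob_z * scale + mean               # zscore_inv: [0.16, 0.46, 0.58]
prob_ave = prob.mean()                     # 0.4
prev = np.array([1 - prob_ave, prob_ave])  # binary prevalence: [0.6, 0.4]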
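And a hypothetical end-to-end use of StatModelLR, under the same assumptions (`train` and `test` are quapy LabelledCollections whose last feature column is the z-scored "prob"):

smlr = StatModelLR(posteriors_column=-1, mean=prob_mean, scale=prob_std)
smlr.fit(train)                 # fits LogisticRegressionCV on the de-standardized "prob" column alone
prev = smlr.quantify(test.X)    # average of the recalibrated posteriors, as a binary prevalence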
@@ -39,7 +39,6 @@ class CombinationRule(ABC):
         return prevalence
 
 
-
 def optimize_ensemble(area_data: Iterable, q: BaseQuantifier, Madj=None, hyper=None, error='mae'):
     if hyper is None:
         hyper = {
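The default hyperparameter grid is truncated in this hunk, so only the call shape is visible. A hedged usage sketch (the adjacency-matrix path and the choice of PACC are illustrative assumptions, not taken from this commit):

Madj = AdjMatrix('./data/adjacency.csv', add_diagonal=False)   # hypothetical path
optimize_ensemble(trains, PACC(classifier()), Madj=Madj, error='mae')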
@@ -23,6 +23,8 @@ def quantifiers():
     yield 'ACC', ACC(cls)
     yield 'PACC', PACC(cls)
     yield 'SLD', SLD(cls)
+    yield 'SModelLR', StatModelLR()
+    yield 'SModel', StatModel(mean=prob_mean, scale=prob_std)
 
 
 survey_y = './data/survey_y.csv'
@@ -31,6 +33,7 @@ Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)
 
 preprocessor = Preprocessor()
 Xtr = preprocessor.fit_transform(Xtr)
+prob_mean, prob_std = preprocessor.get_mean_std(column=-1)  # get the mean and std of the "prob" column
 
 trains = get_dataset_by_area(Atr, Xtr, ytr)
 n_areas = len(trains)
@@ -1,6 +1,6 @@
 from collections import defaultdict
 import matplotlib.pyplot as plt
-from matplotlib.cm import get_cmap
+from matplotlib.pyplot import get_cmap
 import numpy as np
 from matplotlib import cm
 from scipy.stats import ttest_ind_from_stats
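The get_cmap import moves from matplotlib.cm to matplotlib.pyplot, presumably to track Matplotlib's deprecation (and eventual removal) of matplotlib.cm.get_cmap; the pyplot wrapper keeps working:

from matplotlib.pyplot import get_cmap
cmap = get_cmap('tab10')    # a qualitative Colormap with 10 entries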