"""Experiment 4: test on each area with quantifiers trained on its adjacent areas.

For every test area ``Ai`` the training set is the concatenation of all areas
that the adjacency matrix marks as adjacent to ``Ai``. The mean absolute error
between true and estimated prevalence is stored in a LaTeX table.
"""
import numpy as np
from sklearn.linear_model import LogisticRegressionCV

from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp


np.set_printoptions(linewidth=np.inf)


def classifier():
    """Return the base classifier used by every aggregative quantifier."""
    return LogisticRegressionCV()


def quantifiers():
    """Yield ``(name, quantifier)`` pairs; all aggregative ones share one classifier."""
    cls = classifier()
    yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'SLD', EMQ(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

Madj = AdjMatrix('./data/matrice_adiacenza.csv')

areas = [Ai for Ai, _, _ in data]
q_names = [q_name for q_name, _ in quantifiers()]

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test

# one column per quantifier; '<name>-cat' means the quantifier was trained on
# the concatenation of the areas adjacent to the test area
methods = [f'{q_name}-cat' for q_name in q_names]

table = Table(name='adjacentconcat', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
table.format.mean_prec = 4
table.format.show_std = False
table.format.sta = False
table.format.remove_zero = True

for q_name, q in quantifiers():
    method_name = f'{q_name}-cat'
    for Ai, Xi, yi in tqdm(data, total=n_areas):
        # training: the adjacency lookup does not depend on Aj, so do it once per test area
        adjacent = Madj.get_adjacent(Ai)
        trainings = [LabelledCollection(Xj, yj) for Aj, Xj, yj in data if Aj != Ai and Aj in adjacent]
        print(f'for test Ai={Ai} there should be {adjacent}: len={len(trainings)}')
        tr = LabelledCollection.join(*trainings)
        q.fit(tr)

        # test
        te = LabelledCollection(Xi, yi)
        qp.environ["SAMPLE_SIZE"] = len(te)
        pred_prev = q.quantify(te.X)
        true_prev = te.prevalence()
        err = qp.error.mae(true_prev, pred_prev)

        table.add(benchmark=f'te-{Ai}', method=method_name, v=err)


Table.LatexPDF('./results/adjacentconcat/doc.pdf', [table])
"""Experiment 4.2: aggregate per-area quantifiers tuned by model selection.

For every test area, one quantifier per remaining area is optimized with
``optimize_ensemble`` and the predictions of the quantifiers of adjacent areas
are combined with the median and with the mean. One table is produced per
aggregation function.
"""
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

from Census.methods import AreaQuantifier, AggregationRule, optimize_ensemble
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ, MS, MS2
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp
from copy import deepcopy


np.set_printoptions(linewidth=np.inf)


def classifier():
    """Return the base classifier whose hyperparameters are later grid-searched."""
    return LogisticRegression()


def quantifiers():
    """Yield ``(name, quantifier)`` pairs; MLPE, MS2 and SLD (EMQ) are currently disabled."""
    cls = classifier()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'MS', MS(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

areas = [Ai for Ai, _, _ in data]
q_names = [q_name for q_name, _ in quantifiers()]

Madj = AdjMatrix('./data/matrice_adiacenza.csv')

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test

aggregations = ['median', 'mean']


def new_table(aggr):
    """Create the result table for one aggregation function."""
    # one column per quantifier; '<name>-<aggr>' means the predictions of the
    # per-area quantifiers were combined with <aggr>
    methods = [f'{q_name}-{aggr}' for q_name in q_names]
    table = Table(name=f'adjacent{aggr}optim', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
    table.format.mean_prec = 4
    table.format.show_std = False
    table.format.sta = False
    table.format.remove_zero = True
    return table


tables = {aggr: new_table(aggr) for aggr in aggregations}

for q_name, q in quantifiers():
    for Ai, Xi, yi in tqdm(data, total=n_areas):
        # training: the optimized ensemble does not depend on the aggregation
        # function, so the (expensive) model selection runs once per test area
        # and is shared by all aggregation rules
        other_area = [(Aj, Xj, yj) for Aj, Xj, yj in data if Aj != Ai]
        area_quantifiers = optimize_ensemble(other_area, q, Madj)

        # test
        te = LabelledCollection(Xi, yi)
        qp.environ["SAMPLE_SIZE"] = len(te)
        true_prev = te.prevalence()

        for aggr, table in tables.items():
            rule = AggregationRule(area_quantifiers, adjacent_matrix=Madj, aggr=aggr)
            pred_prev = rule.predict(Ai, te.X)
            err = qp.error.mae(true_prev, pred_prev)
            table.add(benchmark=f'te-{Ai}', method=f'{q_name}-{aggr}', v=err)

Table.LatexPDF('./results/adjacentaggregationoptim/doc.pdf', list(tables.values()))
test + +# areas on which a quantifier is trained, e.g., 'PACC-w/o46' means a PACC quantifier +# has been trained on all areas but 46 +methods = [f'{q_name}-cat' for q_name in q_names] + +table = Table(name='allconcat', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local') +table.format.mean_prec = 4 +table.format.show_std = False +table.format.sta = False +table.format.remove_zero = True + +for q_name, q in quantifiers(): + for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas): + #training + trainings = [LabelledCollection(Xj, yj) for Aj, Xj, yj in data if Aj!=Ai] + tr = LabelledCollection.join(*trainings) + q.fit(tr) + + #test + te = LabelledCollection(Xi, yi) + qp.environ["SAMPLE_SIZE"] = len(te) + pred_prev = q.quantify(te.X) + true_prev = te.prevalence() + err = qp.error.mae(true_prev, pred_prev) + + method_name = f'{q_name}-cat' + table.add(benchmark=f'te-{Ai}', method=method_name, v=err) + + # text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}') + + +Table.LatexPDF(f'./results/allconcat/doc.pdf', [table]) + +# with open(f'./results/classifier/output.txt', 'tw') as foo: +# foo.write('\n'.join(text_outputs)) + + + + + diff --git a/Census/allmedian_3.1.py b/Census/allmedian_3.1.py new file mode 100644 index 0000000..7380f28 --- /dev/null +++ b/Census/allmedian_3.1.py @@ -0,0 +1,96 @@ +import numpy as np +from sklearn.linear_model import LogisticRegressionCV + +from Census.methods import AreaQuantifier, AggregationRule +from quapy.data import LabelledCollection +from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE +from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ +from commons import * +from table import Table +from tqdm import tqdm +import quapy as qp +from copy import deepcopy + + +np.set_printoptions(linewidth=np.inf) + +def classifier(): + return LogisticRegressionCV() + +def quantifiers(): + cls = classifier() + yield 
'MLPE', MLPE() + yield 'CC', CC(cls) + yield 'PCC', PCC(cls) + yield 'ACC', ACC(cls) + yield 'PACC', PACC(cls) + yield 'SLD', EMQ(cls) + + +survey_y = './data/survey_y.csv' + +Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True) + +preprocessor = Preprocessor() +Xtr = preprocessor.fit_transform(Xtr) + +data = get_dataset_by_area(Atr, Xtr, ytr) +n_areas = len(data) + +areas = [Ai for Ai, _, _ in data] +q_names = [q_name for q_name, _ in quantifiers()] + +tables = [] +text_outputs = [] + +benchmarks = [f'te-{Ai}' for Ai in areas] # areas used as test + +for aggr in ['median', 'mean']: + + # areas on which a quantifier is trained, e.g., 'PACC-w/o46' means a PACC quantifier + # has been trained on all areas but 46 + methods = [f'{q_name}-{aggr}' for q_name in q_names] + + table = Table(name=f'all{aggr}', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local') + table.format.mean_prec = 4 + table.format.show_std = False + table.format.sta = False + table.format.remove_zero = True + + for q_name, q in quantifiers(): + # pretrain quantifiers per area + pretrained_area_q = [] + for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas): + q_i = deepcopy(q) + q_i.fit(LabelledCollection(Xi, yi)) + pretrained_area_q.append(AreaQuantifier(Ai, q_i)) + + for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas): + # compose members of the rule (quantifiers are already fit) + #training + area_quantifiers = [qA_j for qA_j in pretrained_area_q if qA_j.area != Ai] + rule = AggregationRule(area_quantifiers, aggr=aggr) + + #test + te = LabelledCollection(Xi, yi) + qp.environ["SAMPLE_SIZE"] = len(te) + pred_prev = rule.predict(Ai, te.X) + true_prev = te.prevalence() + err = qp.error.mae(true_prev, pred_prev) + + method_name = f'{q_name}-{aggr}' + table.add(benchmark=f'te-{Ai}', method=method_name, v=err) + + # text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}') + + tables.append(table) + 
def classifiers():
    """Yield ``(name, classifier)`` pairs to be compared in the accuracy experiment.

    The classifiers are created lazily, one per ``yield``.
    """
    # The module only imports SVC; without these two imports the generator
    # fails with NameError on its first item.
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.svm import LinearSVC

    yield 'LR-opt', LogisticRegressionCV(class_weight='balanced', Cs=10)
    yield 'LR-def', LogisticRegressionCV()
    yield 'SVM-linear', LinearSVC()
    yield 'SVM-rbf', SVC(kernel='rbf')
def load_csv(file, use_yhat=True):
    """Load a survey/census CSV and split it into area codes, covariates and labels.

    :param file: path (or file-like object) accepted by ``pandas.read_csv``
    :param use_yhat: whether the ``y.hat`` column is included among the covariates
    :return: ``(A, X, y)`` if the file has a ``y.true`` column, ``(A, X)`` otherwise,
        where ``A`` holds the ``cod.prov`` values and ``X`` the covariate matrix
    """
    df = pd.read_csv(file)

    cod_area = 'cod.prov'
    y_true = 'y.true'

    covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize']
    if use_yhat:
        covariates.append('y.hat')
    covariates.append('prob')

    X = df[covariates].values
    A = df[cod_area].values

    if y_true in df.columns:
        return A, X, df[y_true].values
    return A, X


def get_dataset_by_area(A, X, y=None):
    """Group the rows of ``X`` (and of ``y``, if given) by area code.

    :param A: array with one area code per row
    :param X: covariate matrix, aligned with ``A``
    :param y: optional label array, aligned with ``A``
    :return: list of ``(area, X_area, y_area)`` tuples, one per distinct area in
        the order given by ``np.unique``; ``y_area`` is None when ``y`` is None
    """
    data = []
    for area in np.unique(A):
        sel = (A == area)
        ysel = y[sel] if y is not None else None
        data.append((area, X[sel], ysel))
    return data


class AdjMatrix:
    """Adjacency relation between areas, read from a CSV file.

    The first column of the CSV is skipped; the remaining column headers are
    parsed as integer area codes and the remaining cells form the matrix, in
    which the value 1 marks two areas as adjacent.
    """

    def __init__(self, path):
        df = pd.read_csv(path)

        area_codes = np.asarray([int(c) for c in df.columns[1:].values])
        values = df.values[:, 1:]
        print(area_codes)
        print(values)

        self.area2idx = {area: i for i, area in enumerate(area_codes)}
        self.idx2area = area_codes
        self.M = np.asarray(values)

    def adjacent(self, cod_1, cod_2):
        """Return whether the areas with codes ``cod_1`` and ``cod_2`` are adjacent.

        :raises KeyError: if either code is not a column of the matrix
        """
        idx1 = self.area2idx[cod_1]
        idx2 = self.area2idx[cod_2]
        return (self.M[idx1, idx2] == 1)

    def get_adjacent(self, cod):
        """Return the array of area codes adjacent to ``cod``.

        :raises KeyError: if ``cod`` is not a column of the matrix
        """
        idx = self.area2idx[cod]
        idx_adj = np.argwhere(self.M[idx] == 1).flatten()
        return self.idx2area[idx_adj]


class Preprocessor:
    """Standardize (z-score) selected columns of a covariate matrix.

    :param standardize_col_ids: indices of the columns to standardize; None
        (default) standardizes every column of the matrix being processed
    """

    def __init__(self, standardize_col_ids=None):
        self.scaler = StandardScaler()
        # None means "everything", resolved against the actual number of
        # columns so that matrices with or without 'y.hat' both work
        self.standardize_col_ids = (
            None if standardize_col_ids is None else np.asarray(standardize_col_ids)
        )

    def _columns(self, X):
        """Return the column indices to standardize for matrix ``X``."""
        if self.standardize_col_ids is None:
            return np.arange(X.shape[1])
        return self.standardize_col_ids

    def fit(self, X, y=None):
        """Fit the scaler on the selected columns of ``X``; ``y`` is ignored."""
        X = np.asarray(X)
        self.scaler.fit(X[:, self._columns(X)])
        return self

    def transform(self, X):
        """Return a standardized copy of ``X``; the input array is left untouched."""
        Xout = np.array(X, copy=True)
        # writing z-scores into an integer/boolean array would silently truncate them
        if Xout.dtype.kind in 'iub':
            Xout = Xout.astype(float)
        cols = self._columns(Xout)
        Xout[:, cols] = self.scaler.transform(Xout[:, cols])
        return Xout

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its standardized copy."""
        return self.fit(X, y).transform(X)
class AreaQuantifier:
    """A quantifier tagged with the code of the area it was trained on."""

    def __init__(self, area: int, quantifier: 'BaseQuantifier'):
        self.area = area
        self.quantifier = quantifier

    def quantify(self, X):
        """Delegate prevalence estimation on ``X`` to the wrapped quantifier."""
        return self.quantifier.quantify(X)


class CombinationRule(ABC):
    """Base class for rules that predict the prevalence of an area by selecting
    some area quantifiers and combining their outputs."""

    def __init__(self, area_quantifiers: List[AreaQuantifier]):
        self.area_quantifiers = area_quantifiers

    @abstractmethod
    def select_quantifiers(self, area: int, X):
        """Return the area quantifiers to be used for test area ``area``."""
        ...

    @abstractmethod
    def combination(self, choice, X):
        """Combine the outputs of the quantifiers in ``choice`` on ``X``."""
        ...

    def predict(self, area: int, X):
        """Select the quantifiers for ``area`` and return their combined estimate on ``X``."""
        choice = self.select_quantifiers(area, X)
        prevalence = self.combination(choice, X)
        return prevalence


def optimize_ensemble(area_data: Iterable, q: 'BaseQuantifier', Madj=None, hyper=None, error='mae'):
    """Train one model-selected quantifier per area.

    For every area, a copy of ``q`` is tuned (see ``optim``) by training on that
    area and validating on the other areas; when ``Madj`` is given, only the
    areas adjacent to the training area are used for validation.

    :param area_data: iterable of ``(area, X, y)`` tuples
    :param q: template quantifier; it is never modified
    :param Madj: optional adjacency matrix exposing ``get_adjacent(area)``
    :param hyper: hyperparameter grid; defaults to a grid over the classifier's
        ``C`` and ``class_weight``
    :param error: error measure used for model selection
    :return: list of ``AreaQuantifier``, one per area, in input order
    """
    if hyper is None:
        hyper = {
            'classifier__C': np.logspace(-4, 4, 9),
            'classifier__class_weight': ['balanced', None]
        }

    labelled_collections = [(A, LabelledCollection(X, y)) for A, X, y in area_data]

    area_quantifiers = []
    for A, lc in labelled_collections:
        if Madj is None:
            rest = [lc_j for Aj, lc_j in labelled_collections if Aj != A]
        else:
            adjacent = Madj.get_adjacent(A)  # loop-invariant: look it up once per area
            rest = [lc_j for Aj, lc_j in labelled_collections if Aj != A and Aj in adjacent]
        # keep `q` bound to the caller's template: every area is tuned from the
        # same starting point rather than from the previous area's fitted model
        fitted = optim(q, lc, rest, hyper, error)
        area_quantifiers.append(AreaQuantifier(A, fitted))

    return area_quantifiers


class AggregationRule(CombinationRule):
    """Combine the estimates of the selected area quantifiers with the mean or the median.

    :param area_quantifiers: the candidate (already fitted) area quantifiers
    :param adjacent_matrix: if given, only quantifiers of areas adjacent to the
        test area are used; otherwise all candidates are used
    :param aggr: ``'median'`` or ``'mean'``
    """

    def __init__(self, area_quantifiers: List[AreaQuantifier], adjacent_matrix: 'AdjMatrix' = None, aggr='median'):
        assert aggr in ['mean', 'median'], f'unknown {aggr=}'
        super().__init__(area_quantifiers)
        self.adjacent_matrix = adjacent_matrix
        self.aggr = aggr

    def select_quantifiers(self, area: int, X):
        """Return all candidates, or only those of areas adjacent to ``area``."""
        if self.adjacent_matrix is None:
            chosen = self.area_quantifiers
        else:
            adjacent = self.adjacent_matrix.get_adjacent(area)
            chosen = [q_i for q_i in self.area_quantifiers if q_i.area in adjacent]
        return chosen

    def combination(self, choice, X):
        """Return the class-wise median/mean of the prevalence vectors estimated by ``choice``."""
        prevs = np.asarray([q.quantify(X) for q in choice])
        if self.aggr == 'median':
            prev = np.median(prevs, axis=0)
        elif self.aggr == 'mean':
            prev = np.mean(prevs, axis=0)
        else:
            raise NotImplementedError(f'{self.aggr=} not implemented')
        return prev


def optim(q: 'BaseQuantifier', train: 'LabelledCollection', labelled_collections: Iterable['LabelledCollection'], hyper: dict, error='mae'):
    """Return a copy of ``q`` trained on ``train`` with hyperparameters chosen by grid search.

    The grid search validates on ``labelled_collections``. If it raises
    ``ValueError``, a message is printed and the copy is trained on ``train``
    without model selection.

    :param q: template quantifier; a deep copy is trained, ``q`` itself is untouched
    :param train: training collection
    :param labelled_collections: validation collections
    :param hyper: hyperparameter grid passed to ``GridSearchQ``
    :param error: error measure to minimize
    :return: the fitted quantifier
    """
    q = deepcopy(q)

    prot = qp.protocol.IterateProtocol(labelled_collections)
    try:
        mod_sel = qp.model_selection.GridSearchQ(
            model=q,
            param_grid=hyper,
            protocol=prot,
            error=error,
            refit=False,
            n_jobs=-1
        ).fit(train)

        fitted = mod_sel.best_model_
    except ValueError:
        print(f'method {q} failed; training without model selection')
        fitted = q.fit(train)

    return fitted
@dataclass
class CellFormat:
    """Rendering options shared by the cells of a table."""
    mean_prec: int = 3  # decimals shown for the mean
    std_prec: int = 3  # decimals shown for the standard deviation
    show_std: bool = True  # whether ' $\pm$ std' is appended to the mean
    remove_zero: bool = False  # whether the leading zero is dropped ('0.25' -> '.25')
    color: bool = True  # whether a \cellcolor command is appended
    maxtone: int = 50  # colour intensity used for the best and worst cells


class Cell:
    """The values collected for one (benchmark, method) pair, plus their LaTeX rendering.

    A cell registers itself in ``group``, which is used to decide whether the
    cell is the best of the group, to run statistical tests and to pick a colour.
    """

    def __init__(self, format: CellFormat, group: 'CellGroup'):
        self.values = []
        self.format = format
        self.touch()
        self.group = group
        self.group.register_cell(self)

    def __len__(self):
        return len(self.values)

    def mean(self):
        """Return the (cached) mean of the values."""
        if self.mean_ is None:
            self.mean_ = np.mean(self.values)
        return self.mean_

    def std(self):
        """Return the (cached) standard deviation of the values."""
        if self.std_ is None:
            self.std_ = np.std(self.values)
        return self.std_

    def touch(self):
        """Invalidate the cached mean and standard deviation."""
        self.mean_ = None
        self.std_ = None

    def append(self, v: Union[float, Iterable]):
        """Add one value, or every value of an iterable, to the cell."""
        if isinstance(v, Iterable):
            self.values.extend(v)
        else:
            # only scalars are appended; an iterable must not also be stored
            # as a single (nested) element
            self.values.append(v)
        self.touch()

    def isEmpty(self):
        return len(self) == 0

    def isBest(self):
        """Return whether this cell is, or ties with, the best cell of its group."""
        best = self.group.best()
        if best is not None:
            return (best == self) or (np.isclose(best.mean(), self.mean()))
        return False

    def print_mean(self):
        """Return the mean formatted with ``format.mean_prec`` decimals ('' if empty)."""
        if self.isEmpty():
            return ''
        else:
            return f'{self.mean():.{self.format.mean_prec}f}'

    def print(self):
        """Return the LaTeX string of the cell ('' if the cell is empty)."""
        if self.isEmpty():
            return ''

        # mean
        # ---------------------------------------------------
        mean = self.print_mean()
        if self.format.remove_zero:
            # strip only the leading zero; a blanket replace('0.', '.') would
            # turn e.g. '10.50' into '1.50'
            if mean.startswith('0.'):
                mean = mean[1:]
            elif mean.startswith('-0.'):
                mean = '-' + mean[2:]

        # std ?
        # ---------------------------------------------------
        if self.format.show_std:
            std = rf' $\pm$ {self.std():.{self.format.std_prec}f}'
        else:
            std = ''

        # bold or statistical test
        # ---------------------------------------------------
        if self.isBest():
            str_cell = f'\\textbf{{{mean}{std}}}'
        else:
            comp_symbol = ''
            pval = self.group.compare(self)
            if pval is not None:
                if 0.005 > pval:
                    comp_symbol = ''
                elif 0.05 > pval >= 0.005:
                    comp_symbol = r'$^{\dag}$'
                elif pval >= 0.05:
                    comp_symbol = r'${\ddag}$'
            str_cell = f'{mean}{comp_symbol}{std}'

        # color ?
        # ---------------------------------------------------
        if self.format.color:
            str_cell += ' ' + self.group.color(self)

        return str_cell


class CellGroup:
    """A set of cells that are compared with each other (e.g. one table row).

    :param lower_is_better: whether the best cell is the one with the lowest mean
    :param stat_test: ``'wilcoxon'``, ``'ttest'`` or None (no test)
    :param color_mode: ``'local'`` scales colours between the best and worst
        cells of the group; ``'global'`` scales them between
        ``color_global_min`` and ``color_global_max``
    """

    def __init__(self, lower_is_better=True, stat_test='wilcoxon', color_mode='local', color_global_min=None, color_global_max=None):
        assert stat_test in ['wilcoxon', 'ttest', None], \
            f"unknown {stat_test=}, valid ones are wilcoxon, ttest, or None"
        assert color_mode in ['local', 'global'], \
            f"unknown {color_mode=}, valid ones are local and global"
        if (color_global_min is not None or color_global_max is not None) and color_mode == 'local':
            print('warning: color_global_min and color_global_max are only considered when color_mode==global')
        self.cells = []
        self.lower_is_better = lower_is_better
        self.stat_test = stat_test
        self.color_mode = color_mode
        self.color_global_min = color_global_min
        self.color_global_max = color_global_max

    def register_cell(self, cell: Cell):
        self.cells.append(cell)

    def non_empty_cells(self):
        return [c for c in self.cells if not c.isEmpty()]

    def max(self):
        """Return the non-empty cell with the highest mean, or None."""
        cells = self.non_empty_cells()
        if len(cells) > 0:
            return cells[np.argmax([c.mean() for c in cells])]
        return None

    def min(self):
        """Return the non-empty cell with the lowest mean, or None."""
        cells = self.non_empty_cells()
        if len(cells) > 0:
            return cells[np.argmin([c.mean() for c in cells])]
        return None

    def best(self) -> Cell:
        return self.min() if self.lower_is_better else self.max()

    def worst(self) -> Cell:
        return self.max() if self.lower_is_better else self.min()

    def isEmpty(self):
        return len(self.non_empty_cells()) == 0

    def compare(self, cell: Cell):
        """Return the p-value of ``cell`` against the best cell of the group.

        Returns None when there is nothing to compare (no best cell, an empty
        cell, no test configured, or the Wilcoxon test rejecting the input).

        :raises ValueError: if ``stat_test`` is not a known test
        """
        best = self.best()
        if best is None:  # empty group: nothing to compare against
            return None
        best_n = len(best)
        cell_n = len(cell)
        if best_n == 0 or cell_n == 0:
            return None

        if self.stat_test == 'wilcoxon':
            try:
                _, p_val = wilcoxon(best.values, cell.values)
            except ValueError:
                p_val = None
            return p_val
        elif self.stat_test == 'ttest':
            best_mean, best_std = best.mean(), best.std()
            cell_mean, cell_std = cell.mean(), cell.std()
            _, p_val = ttest_ind_from_stats(best_mean, best_std, best_n, cell_mean, cell_std, cell_n)
            return p_val
        elif self.stat_test is None:
            return None
        else:
            raise ValueError(f'unknown statistical test {self.stat_test}')

    def color(self, cell: Cell):
        """Return the ``\\cellcolor`` command for ``cell`` ('' when no colour applies).

        The mean of the cell is rescaled to [-1, 1] within the colour range;
        negative values are rendered in red and the rest in green, with an
        intensity of up to ``cell.format.maxtone``.
        """
        if cell.isEmpty():
            return ''

        if self.color_mode == 'local':
            best = self.best()
            worst = self.worst()
            # check for None before asking for the means
            if best is None or worst is None:
                return ''
            best_mean = best.mean()
            worst_mean = worst.mean()
            if best_mean == worst_mean:
                return ''

            # normalize val in [0,1]
            maxval = max(best_mean, worst_mean)
            minval = min(best_mean, worst_mean)
        else:
            maxval = self.color_global_max
            minval = self.color_global_min
            # an unset or degenerate range cannot be normalized
            if maxval is None or minval is None or maxval == minval:
                return ''

        normval = (cell.mean() - minval) / (maxval - minval)

        if self.lower_is_better:
            normval = 1 - normval

        normval = np.clip(normval, 0, 1)

        normval = normval * 2 - 1  # rescale to [-1,1]
        if normval < 0:
            color = 'red'
            tone = cell.format.maxtone * (-normval)
        else:
            color = 'green'
            tone = cell.format.maxtone * normval

        return rf'\cellcolor{{{color}!{int(tone)}}}'
class Table:
    """A benchmarks x methods table of results that can be rendered as LaTeX.

    Values are added with :meth:`add`; every (benchmark, method) pair owns a cell,
    and all the cells of a benchmark share one cell group, so that ranking,
    statistical tests and coloring are computed per benchmark.

    :param name: name of the table (used to name the generated tabular file)
    :param benchmarks: initial list of benchmark names (more are registered on demand)
    :param methods: initial list of method names (more are registered on demand)
    :param format: the cell format shared by all cells (a default one if None)
    :param lower_is_better: whether lower values are better
    :param stat_test: 'wilcoxon', 'ttest', or None
    :param color_mode: 'local' or 'global' (in 'global' mode the color scale is [0,1])
    :param with_mean: whether to add a row (or column, if transposed) with the means
    """

    def __init__(self,
                 name,
                 benchmarks=None,
                 methods=None,
                 format: 'CellFormat' = None,
                 lower_is_better=True,
                 stat_test='wilcoxon',
                 color_mode='local',
                 with_mean=True
                 ):
        self.name = name
        # copies: get() registers unseen names and must not mutate the caller's lists
        self.benchmarks = [] if benchmarks is None else list(benchmarks)
        self.methods = [] if methods is None else list(methods)
        self.format = format if format is not None else CellFormat()
        self.lower_is_better = lower_is_better
        self.stat_test = stat_test
        self.color_mode = color_mode
        self.with_mean = with_mean
        # if False, compute the mean of partially empty methods also
        # NOTE(review): this flag is not consulted anywhere in this class -- confirm intent
        self.only_full_mean = True

        if self.color_mode == 'global':
            self.color_global_min = 0
            self.color_global_max = 1
        else:
            self.color_global_min = None
            self.color_global_max = None

        self.T = {}  # maps (benchmark index, method index) -> cell
        self.groups = {}  # maps benchmark -> cell group

    def add(self, benchmark, method, v):
        """Appends the value ``v`` to the cell of (benchmark, method)."""
        cell = self.get(benchmark, method)
        cell.append(v)

    def get_benchmarks(self):
        """Returns the list of registered benchmarks."""
        return self.benchmarks

    def get_methods(self):
        """Returns the list of registered methods."""
        return self.methods

    def n_benchmarks(self):
        """Returns the number of registered benchmarks."""
        return len(self.benchmarks)

    def n_methods(self):
        """Returns the number of registered methods."""
        return len(self.methods)

    def _new_group(self):
        """Creates an empty cell group configured like this table."""
        return CellGroup(self.lower_is_better, self.stat_test, color_mode=self.color_mode,
                         color_global_max=self.color_global_max, color_global_min=self.color_global_min)

    def get(self, benchmark, method) -> 'Cell':
        """Returns the cell of (benchmark, method).

        Unseen benchmarks and methods are registered, and the cell (and the group
        of the benchmark) is created if it does not exist yet.
        """
        if benchmark not in self.benchmarks:
            self.benchmarks.append(benchmark)
        if benchmark not in self.groups:
            self.groups[benchmark] = self._new_group()
        if method not in self.methods:
            self.methods.append(method)
        idx = (self.benchmarks.index(benchmark), self.methods.index(method))
        if idx not in self.T:
            self.T[idx] = Cell(self.format, group=self.groups[benchmark])
        return self.T[idx]

    def get_value(self, benchmark, method) -> float:
        """Returns the mean of the cell of (benchmark, method)."""
        return self.get(benchmark, method).mean()

    def get_benchmark(self, benchmark):
        """Returns the non-empty cells of a benchmark (one per method at most)."""
        cells = [self.get(benchmark, method=m) for m in self.get_methods()]
        return [c for c in cells if not c.isEmpty()]

    def get_method(self, method):
        """Returns the non-empty cells of a method (one per benchmark at most)."""
        cells = [self.get(benchmark=b, method=method) for b in self.get_benchmarks()]
        return [c for c in cells if not c.isEmpty()]

    def get_method_means(self, method_order):
        """Returns one cell per method in ``method_order`` holding the per-benchmark means.

        The returned cells share a dedicated group, so that the means are ranked and
        colored among themselves. Benchmarks whose mean is NaN are skipped.
        """
        mean_group = self._new_group()
        cells = []
        for method in method_order:
            method_mean = Cell(self.format, group=mean_group)
            for bench in self.get_benchmarks():
                mean_value = self.get_value(benchmark=bench, method=method)
                if not np.isnan(mean_value):
                    method_mean.append(mean_value)
            cells.append(method_mean)
        return cells

    def get_benchmark_values(self, benchmark):
        """Returns an array with the means of the non-empty cells of a benchmark."""
        return np.asarray([c.mean() for c in self.get_benchmark(benchmark)])

    def get_method_values(self, method):
        """Returns an array with the means of the non-empty cells of a method."""
        return np.asarray([c.mean() for c in self.get_method(method)])

    def all_mean(self):
        """Returns the mean of the means of all non-empty cells (NaN if there are none)."""
        values = [c.mean() for c in self.T.values() if not c.isEmpty()]
        if not values:
            # same result np.mean([]) would give, without the RuntimeWarning
            return np.nan
        return np.mean(values)

    def print(self):  # todo: missing method names?
        """Prints the table to stdout as a pandas dataframe (benchmarks as rows)."""
        data_dict = {}
        data_dict['Benchmark'] = [b for b in self.get_benchmarks()]
        for method in self.get_methods():
            data_dict[method] = [self.get(bench, method).print_mean() for bench in self.get_benchmarks()]
        df = pd.DataFrame(data_dict)
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)
        print(df.to_string(index=False))

    def tabular(self, path=None, benchmark_replace=None, method_replace=None, benchmark_order=None, method_order=None, transpose=False):
        """Generates the LaTeX ``tabular`` environment of the table.

        :param path: if given, the tabular is also written to this file
        :param benchmark_replace: dict mapping benchmark names to the names to display
        :param method_replace: dict mapping method names to the names to display
        :param benchmark_order: benchmarks to show, in order (default: all registered)
        :param method_order: methods to show, in order (default: all registered)
        :param transpose: if True, methods are rows and benchmarks are columns
        :return: the LaTeX code of the tabular
        """
        if benchmark_replace is None:
            benchmark_replace = {}
        if method_replace is None:
            method_replace = {}
        if benchmark_order is None:
            benchmark_order = self.get_benchmarks()
        if method_order is None:
            method_order = self.get_methods()

        if transpose:
            row_order, row_replace = method_order, method_replace
            col_order, col_replace = benchmark_order, benchmark_replace
        else:
            row_order, row_replace = benchmark_order, benchmark_replace
            col_order, col_replace = method_order, method_replace

        n_cols = len(col_order)
        # the means are always per method: a column if methods are rows, a row otherwise
        add_mean_col = self.with_mean and transpose
        add_mean_row = self.with_mean and not transpose
        last_col_idx = n_cols + 2 if add_mean_col else n_cols + 1

        if self.with_mean:
            mean_cells = self.get_method_means(method_order)

        lines = []
        lines.append('\\begin{tabular}{|c' + '|c' * n_cols + ('||c' if add_mean_col else '') + "|}")

        lines.append(f'\\cline{{2-{last_col_idx}}}')
        l = '\\multicolumn{1}{c|}{} & '
        l += ' & '.join([col_replace.get(col, col) for col in col_order])
        if add_mean_col:
            l += ' & Ave.'
        l += ' \\\\\\hline'
        lines.append(l)

        for i, row in enumerate(row_order):
            rowname = row_replace.get(row, row)
            l = rowname + ' & '
            l += ' & '.join([
                self.get(benchmark=col if transpose else row, method=row if transpose else col).print()
                for col in col_order
            ])
            if add_mean_col:
                l += ' & ' + mean_cells[i].print()
            l += ' \\\\\\hline'
            lines.append(l)

        if add_mean_row:
            lines.append('\\hline')
            l = 'Ave. & '
            l += ' & '.join([mean_cell.print() for mean_cell in mean_cells])
            l += ' \\\\\\hline'
            lines.append(l)

        lines.append('\\end{tabular}')

        tabular_tex = '\n'.join(lines)

        if path is not None:
            os.makedirs(Path(path).parent, exist_ok=True)
            # the generated document declares utf8 input encoding
            with open(path, 'wt', encoding='utf-8') as foo:
                foo.write(tabular_tex)

        return tabular_tex

    def table(self, tabular_path, benchmark_replace=None, method_replace=None, resizebox=True, caption=None, label=None, benchmark_order=None, method_order=None, transpose=False):
        """Generates the LaTeX ``table`` environment wrapping the tabular.

        :param tabular_path: file in which the tabular is stored and then ``\\input``;
            if None, the tabular code is embedded in the table
        :param resizebox: whether to fit the tabular to the text width
        :param caption: caption of the table; by default, the tabular path (or the
            name of the table if there is no tabular path) with underscores escaped
        :param label: optional LaTeX label
        :return: the LaTeX code of the table

        The remaining parameters are passed on to :meth:`tabular`.
        """
        if benchmark_replace is None:
            benchmark_replace = {}
        if method_replace is None:
            method_replace = {}

        lines = []
        lines.append('\\begin{table}')
        lines.append('\\center')
        if resizebox:
            lines.append('\\resizebox{\\textwidth}{!}{%')

        tabular_str = self.tabular(tabular_path, benchmark_replace, method_replace, benchmark_order, method_order, transpose)
        if tabular_path is None:
            lines.append(tabular_str)
        else:
            # NOTE(review): the folder is hard-coded as 'tables' even if the tabular is stored elsewhere
            lines.append(f'\\input{{tables/{Path(tabular_path).name}}}')

        if resizebox:
            lines.append('}%')
        if caption is None:
            # str() also covers Path objects, whose .replace() would rename a file instead
            caption_source = self.name if tabular_path is None else tabular_path
            caption = str(caption_source).replace('_', '\\_')
        lines.append(f'\\caption{{{caption}}}')
        if label is not None:
            lines.append(f'\\label{{{label}}}')
        lines.append('\\end{table}')

        table_tex = '\n'.join(lines)

        return table_tex

    def document(self, tex_path, tabular_dir='tables', *args, **kwargs):
        """Writes a LaTeX document containing only this table; see :meth:`Document`."""
        # positional call: mixing tables=... with *args raised "multiple values" errors
        return Table.Document(tex_path, [self], tabular_dir, *args, **kwargs)

    def latexPDF(self, pdf_path, tabular_dir='tables', *args, **kwargs):
        """Compiles a pdf containing only this table; see :meth:`LatexPDF`."""
        return Table.LatexPDF(pdf_path, [self], tabular_dir, *args, **kwargs)

    @classmethod
    def Document(cls, tex_path, tables: List['Table'], tabular_dir='tables', *args, **kwargs):
        """Writes a standalone LaTeX document with the given tables.

        :param tex_path: path of the .tex file to generate
        :param tables: the tables to include
        :param tabular_dir: folder (relative to the folder of ``tex_path``) in which
            the tabular of each table is stored as ``<table name>_table.tex``
        :return: the LaTeX code of the document

        Any extra argument is passed on to :meth:`table`.
        """
        lines = []
        lines.append('\\documentclass[10pt,a4paper]{article}')
        lines.append('\\usepackage[utf8]{inputenc}')
        lines.append('\\usepackage{amsmath}')
        lines.append('\\usepackage{amsfonts}')
        lines.append('\\usepackage{amssymb}')
        lines.append('\\usepackage{graphicx}')
        lines.append('\\usepackage{xcolor}')
        lines.append('\\usepackage{colortbl}')
        lines.append('')
        lines.append('\\begin{document}')
        for table in tables:
            lines.append('')
            lines.append(table.table(os.path.join(Path(tex_path).parent, tabular_dir, table.name + '_table.tex'), *args, **kwargs))
        lines.append('\\end{document}')

        document = '\n'.join(lines)

        os.makedirs(Path(tex_path).parent, exist_ok=True)
        with open(tex_path, 'wt', encoding='utf-8') as foo:
            foo.write(document)

        return document

    @classmethod
    def LatexPDF(cls, pdf_path: str, tables: List['Table'], tabular_dir: str = 'tables', *args, **kwargs):
        """Generates the LaTeX sources of the tables and compiles them with pdflatex.

        :param pdf_path: path of the pdf to generate (must end with '.pdf'); the .tex
            file is created alongside it
        :param tables: the tables to include
        :param tabular_dir: folder in which the tabulars are stored; see :meth:`Document`

        Any extra argument is passed on to :meth:`table`. The auxiliary files left
        by pdflatex are removed afterwards.
        """
        import subprocess

        assert pdf_path.endswith('.pdf'), f'{pdf_path=} does not seem a valid name for a pdf file'
        # replace only the extension (str.replace would rewrite every '.pdf' in the path)
        tex_path = pdf_path[:-len('.pdf')] + '.tex'

        cls.Document(tex_path, tables, tabular_dir, *args, **kwargs)

        workdir = Path(pdf_path).parent
        basename = Path(tex_path).stem

        print('currently in', os.getcwd())
        print("[Tables Done] runing latex")
        try:
            # run in the output folder without changing the cwd of this process
            subprocess.run(['pdflatex', Path(tex_path).name], cwd=workdir)
        except FileNotFoundError:
            # best effort, as before: the sources are generated even if they cannot be compiled
            print('warning: pdflatex was not found; the .tex sources were generated but not compiled')
        for ext in ('.aux', '.bbl', '.blg', '.log', '.out', '.dvi'):
            (workdir / (basename + ext)).unlink(missing_ok=True)
# Quick sanity check of the census data and of the adjacency matrix.
# `census` and the helpers used below are defined/imported at the top of this script.

# presumably load_csv yields the area code of every instance plus the covariates
# when called on the census file (two return values) -- verify against commons
Areas, X = load_csv(census, use_yhat=True)
data = get_dataset_by_area(Areas, X)

# the first field of every per-area entry is the area code
areas = [area_code for area_code, *_ in data]

print(f'Area codes={areas}')

A = AdjMatrix('./data/matrice_adiacenza.csv')

# query the adjacency of one pair of areas, then everything adjacent to one area
pair_adjacency = A.adjacent(45, 46)
print(pair_adjacency)
neighbours_of_50 = A.get_adjacent(50)
print(neighbours_of_50)