some experiments run, not much to say though

Alejandro Moreo Fernandez 2024-03-27 16:43:28 +01:00
parent 7ee224521a
commit 1f3b1597dc
12 changed files with 1313 additions and 61 deletions

@@ -0,0 +1,87 @@
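# Experiment (leave-one-area-out, adjacent concatenation): for each target area, every
# quantifier is trained on the concatenation of the samples from the areas adjacent to it
# (per the adjacency matrix) and evaluated, via MAE of the estimated prevalence, on the held-out area.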
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp

np.set_printoptions(linewidth=np.inf)


def classifier():
    return LogisticRegressionCV()


def quantifiers():
    cls = classifier()
    yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'SLD', EMQ(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

Madj = AdjMatrix('./data/matrice_adiacenza.csv')

areas = [Ai for Ai, _, _ in data]
q_names = [q_name for q_name, _ in quantifiers()]

# tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test

# areas on which a quantifier is trained, e.g., 'PACC-w/o46' means a PACC quantifier
# has been trained on all areas but 46
methods = [f'{q_name}-cat' for q_name in q_names]

table = Table(name='adjacentconcat', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
table.format.mean_prec = 4
table.format.show_std = False
table.format.sta = False
table.format.remove_zero = True

for q_name, q in quantifiers():
    for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
        #training
        trainings = [LabelledCollection(Xj, yj) for Aj, Xj, yj in data if Aj!=Ai and Aj in Madj.get_adjacent(Ai)]
        print(f'for test Ai={Ai} there should be {Madj.get_adjacent(Ai)}: len={len(trainings)}')
        tr = LabelledCollection.join(*trainings)
        q.fit(tr)

        #test
        te = LabelledCollection(Xi, yi)
        qp.environ["SAMPLE_SIZE"] = len(te)
        pred_prev = q.quantify(te.X)
        true_prev = te.prevalence()
        err = qp.error.mae(true_prev, pred_prev)

        method_name = f'{q_name}-cat'
        table.add(benchmark=f'te-{Ai}', method=method_name, v=err)

    # text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}')

Table.LatexPDF(f'./results/adjacentconcat/doc.pdf', [table])

# with open(f'./results/classifier/output.txt', 'tw') as foo:
#     foo.write('\n'.join(text_outputs))

@@ -0,0 +1,101 @@
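# Experiment (adjacent aggregation): one quantifier is trained per area; for each target area,
# the prevalence estimates of the quantifiers trained on its adjacent areas are aggregated
# (median or mean) and evaluated, via MAE, against the true prevalence of the held-out area.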
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from Census.methods import AreaQuantifier, AggregationRule
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ, MS, MS2
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp
from copy import deepcopy

np.set_printoptions(linewidth=np.inf)


def classifier():
    return LogisticRegressionCV()


def quantifiers():
    cls = classifier()
    yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'MS', MS(cls)
    # yield 'MS2', MS2(cls)
    # yield 'SLD', EMQ(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

areas = [Ai for Ai, _, _ in data]
q_names = [q_name for q_name, _ in quantifiers()]

Madj = AdjMatrix('./data/matrice_adiacenza.csv')

tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test

for aggr in ['median', 'mean']:

    # areas on which a quantifier is trained, e.g., 'PACC-w/o46' means a PACC quantifier
    # has been trained on all areas but 46
    methods = [f'{q_name}-{aggr}' for q_name in q_names]

    table = Table(name=f'adjacent{aggr}', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
    table.format.mean_prec = 4
    table.format.show_std = False
    table.format.sta = False
    table.format.remove_zero = True

    for q_name, q in quantifiers():

        # pretrain quantifiers per area
        pretrained_area_q = []
        for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
            q_i = deepcopy(q)
            q_i.fit(LabelledCollection(Xi, yi))
            pretrained_area_q.append(AreaQuantifier(Ai, q_i))

        for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
            # compose members of the rule (quantifiers are already fit)
            #training
            area_quantifiers = [qA_j for qA_j in pretrained_area_q if qA_j.area != Ai]
            rule = AggregationRule(area_quantifiers, adjacent_matrix=Madj, aggr=aggr)

            #test
            te = LabelledCollection(Xi, yi)
            qp.environ["SAMPLE_SIZE"] = len(te)
            pred_prev = rule.predict(Ai, te.X)
            true_prev = te.prevalence()
            err = qp.error.mae(true_prev, pred_prev)

            method_name = f'{q_name}-{aggr}'
            table.add(benchmark=f'te-{Ai}', method=method_name, v=err)

        # text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}')

    tables.append(table)

Table.LatexPDF(f'./results/adjacentaggregation/doc.pdf', tables)

# with open(f'./results/classifier/output.txt', 'tw') as foo:
#     foo.write('\n'.join(text_outputs))

@@ -0,0 +1,95 @@
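# Experiment (adjacent aggregation with model selection): as in the adjacent-aggregation setting,
# but each per-area quantifier undergoes model selection (optimize_ensemble) using the remaining
# areas as validation samples before its estimates are aggregated (median or mean) for the held-out area.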
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from Census.methods import AreaQuantifier, AggregationRule, optimize_ensemble
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ, MS, MS2
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp
from copy import deepcopy

np.set_printoptions(linewidth=np.inf)


def classifier():
    return LogisticRegression()


def quantifiers():
    cls = classifier()
    # yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'MS', MS(cls)
    # yield 'MS2', MS2(cls)
    # yield 'SLD', EMQ(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

areas = [Ai for Ai, _, _ in data]
q_names = [q_name for q_name, _ in quantifiers()]

Madj = AdjMatrix('./data/matrice_adiacenza.csv')

tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test

for aggr in ['median', 'mean']:

    # areas on which a quantifier is trained, e.g., 'PACC-w/o46' means a PACC quantifier
    # has been trained on all areas but 46
    methods = [f'{q_name}-{aggr}' for q_name in q_names]

    table = Table(name=f'adjacent{aggr}optim', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
    table.format.mean_prec = 4
    table.format.show_std = False
    table.format.sta = False
    table.format.remove_zero = True

    for q_name, q in quantifiers():
        for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
            # compose members of the rule (quantifiers are optimized wrt the rest of the areas)
            #training
            other_area = [(Aj, Xj, yj) for Aj, Xj, yj in data if Aj != Ai]
            area_quantifiers = optimize_ensemble(other_area, q, Madj)
            rule = AggregationRule(area_quantifiers, adjacent_matrix=Madj, aggr=aggr)

            #test
            te = LabelledCollection(Xi, yi)
            qp.environ["SAMPLE_SIZE"] = len(te)
            pred_prev = rule.predict(Ai, te.X)
            true_prev = te.prevalence()
            err = qp.error.mae(true_prev, pred_prev)

            method_name = f'{q_name}-{aggr}'
            table.add(benchmark=f'te-{Ai}', method=method_name, v=err)

        # text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}')

    tables.append(table)

Table.LatexPDF(f'./results/adjacentaggregationoptim/doc.pdf', tables)

# with open(f'./results/classifier/output.txt', 'tw') as foo:
#     foo.write('\n'.join(text_outputs))

Census/allconcat_3.py (new file, 84 lines)

@@ -0,0 +1,84 @@
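# Experiment (all-areas concatenation): for each target area, every quantifier is trained on the
# concatenation of the samples from all remaining areas and evaluated (MAE) on the held-out area.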
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp

np.set_printoptions(linewidth=np.inf)


def classifier():
    return LogisticRegressionCV()


def quantifiers():
    cls = classifier()
    yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'SLD', EMQ(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

areas = [Ai for Ai, _, _ in data]
q_names = [q_name for q_name, _ in quantifiers()]

# tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test

# areas on which a quantifier is trained, e.g., 'PACC-w/o46' means a PACC quantifier
# has been trained on all areas but 46
methods = [f'{q_name}-cat' for q_name in q_names]

table = Table(name='allconcat', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
table.format.mean_prec = 4
table.format.show_std = False
table.format.sta = False
table.format.remove_zero = True

for q_name, q in quantifiers():
    for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
        #training
        trainings = [LabelledCollection(Xj, yj) for Aj, Xj, yj in data if Aj!=Ai]
        tr = LabelledCollection.join(*trainings)
        q.fit(tr)

        #test
        te = LabelledCollection(Xi, yi)
        qp.environ["SAMPLE_SIZE"] = len(te)
        pred_prev = q.quantify(te.X)
        true_prev = te.prevalence()
        err = qp.error.mae(true_prev, pred_prev)

        method_name = f'{q_name}-cat'
        table.add(benchmark=f'te-{Ai}', method=method_name, v=err)

    # text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}')

Table.LatexPDF(f'./results/allconcat/doc.pdf', [table])

# with open(f'./results/classifier/output.txt', 'tw') as foo:
#     foo.write('\n'.join(text_outputs))

Census/allmedian_3.1.py (new file, 96 lines)

@@ -0,0 +1,96 @@
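# Experiment (all-areas aggregation): one quantifier is trained per area; for each target area,
# the estimates of the quantifiers trained on all remaining areas are aggregated (median or mean)
# and evaluated (MAE) on the held-out area.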
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from Census.methods import AreaQuantifier, AggregationRule
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp
from copy import deepcopy

np.set_printoptions(linewidth=np.inf)


def classifier():
    return LogisticRegressionCV()


def quantifiers():
    cls = classifier()
    yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)
    yield 'SLD', EMQ(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

data = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(data)

areas = [Ai for Ai, _, _ in data]
q_names = [q_name for q_name, _ in quantifiers()]

tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test

for aggr in ['median', 'mean']:

    # areas on which a quantifier is trained, e.g., 'PACC-w/o46' means a PACC quantifier
    # has been trained on all areas but 46
    methods = [f'{q_name}-{aggr}' for q_name in q_names]

    table = Table(name=f'all{aggr}', benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local')
    table.format.mean_prec = 4
    table.format.show_std = False
    table.format.sta = False
    table.format.remove_zero = True

    for q_name, q in quantifiers():

        # pretrain quantifiers per area
        pretrained_area_q = []
        for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
            q_i = deepcopy(q)
            q_i.fit(LabelledCollection(Xi, yi))
            pretrained_area_q.append(AreaQuantifier(Ai, q_i))

        for i, (Ai, Xi, yi) in tqdm(enumerate(data), total=n_areas):
            # compose members of the rule (quantifiers are already fit)
            #training
            area_quantifiers = [qA_j for qA_j in pretrained_area_q if qA_j.area != Ai]
            rule = AggregationRule(area_quantifiers, aggr=aggr)

            #test
            te = LabelledCollection(Xi, yi)
            qp.environ["SAMPLE_SIZE"] = len(te)
            pred_prev = rule.predict(Ai, te.X)
            true_prev = te.prevalence()
            err = qp.error.mae(true_prev, pred_prev)

            method_name = f'{q_name}-{aggr}'
            table.add(benchmark=f'te-{Ai}', method=method_name, v=err)

        # text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}')

    tables.append(table)

Table.LatexPDF(f'./results/allaggregation/doc.pdf', tables)

# with open(f'./results/classifier/output.txt', 'tw') as foo:
#     foo.write('\n'.join(text_outputs))

@@ -0,0 +1,70 @@
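# Experiment (pairwise classification accuracy): each classifier is trained on one area and its
# accuracy is measured on every other area, to gauge how well the classifiers transfer across areas.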
import numpy as np
# note: LogisticRegressionCV, LinearSVC and tqdm are used below but were not imported in the committed file
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from commons import *
from table import Table
from tqdm import tqdm

np.set_printoptions(linewidth=np.inf)


def classifiers():
    yield 'LR-opt', LogisticRegressionCV(class_weight='balanced', Cs=10)
    yield 'LR-def', LogisticRegressionCV()
    yield 'SVM-linear', LinearSVC()
    yield 'SVM-rbf', SVC(kernel='rbf')

survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

trains = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(trains)
areas = [Ai for Ai, _, _ in trains]

tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test
methods = [f'tr-{Ai}' for Ai in areas]     # areas on which a quantifier is trained

for cls_name, c in classifiers():
    table = Table(name=cls_name, benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='local', lower_is_better=False)
    table.format.mean_prec = 4
    table.format.show_std = False
    table.format.sta = False
    table.format.remove_zero = True

    for i, (Ai, Xi, yi) in tqdm(enumerate(trains), total=n_areas):
        c.fit(Xi, yi)
        for j, (Aj, Xj, yj) in enumerate(trains):
            if i==j: continue
            pred_labels = c.predict(Xj)
            true_labels = yj
            acc = (pred_labels==true_labels).mean()
            table.add(benchmark=f'te-{Aj}', method=f'tr-{Ai}', v=acc)

    for test in benchmarks:
        values = table.get_benchmark_values(test)
        table.add(benchmark=test, method='Best', v=max(values))
        table.add(benchmark=test, method='Worst', v=min(values))
        table.add(benchmark=test, method='AVE', v=np.mean(values))

    tables.append(table)
    text_outputs.append(f'{cls_name} got mean {table.all_mean():.5f}')

Table.LatexPDF(f'./results/classifier/doc.pdf', tables)

with open(f'./results/classifier/output.txt', 'tw') as foo:
    foo.write('\n'.join(text_outputs))

Census/commons.py (new file, 90 lines)

@@ -0,0 +1,90 @@
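# Shared utilities: CSV loading, per-area splitting of the survey data, the area adjacency matrix,
# and z-score standardization of the covariates.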
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

np.set_printoptions(linewidth=np.inf)


def load_csv(file, use_yhat=True):
    df = pd.read_csv(file)

    cod_area = 'cod.prov'
    if use_yhat:
        covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'y.hat', 'prob']
    else:
        covariates = ['owner', 'eta', 'work', 'sex', 'year_edu', 'hsize', 'prob']
    y_true = 'y.true'

    X = df[covariates].values
    A = df[cod_area].values

    # for i, cov in enumerate(covariates):
    #     print(f'values of col {i} "{cov}" {np.unique(X[:,i])}')

    if y_true in df.columns:
        y = df[y_true].values
        return A, X, y
    else:
        return A, X


def get_dataset_by_area(A, X, y=None):
    data = []
    for area in np.unique(A):
        sel = (A == area)
        Xsel = X[sel]
        if y is not None:
            ysel = y[sel]
        else:
            ysel = None
        data.append((area, Xsel, ysel))
    return data


class AdjMatrix:

    def __init__(self, path):
        df = pd.read_csv(path)

        area_codes = df.columns[1:].values
        area_codes = np.asarray([int(c) for c in area_codes])
        values = df.values[:, 1:]

        print(area_codes)
        print(values)

        self.area2idx = {area:i for i, area in enumerate(area_codes)}
        self.idx2area = area_codes
        self.M = np.asarray(values)

    def adjacent(self, cod_1, cod_2):
        idx1 = self.area2idx[cod_1]
        idx2 = self.area2idx[cod_2]
        return (self.M[idx1, idx2] == 1)

    def get_adjacent(self, cod):
        idx = self.area2idx[cod]
        idx_adj = np.argwhere(self.M[idx]==1).flatten()
        return self.idx2area[idx_adj]


class Preprocessor:

    def __init__(self):
        self.scaler = StandardScaler()
        # self.standardize_col_ids = np.asarray([1, 4, 5])  # eta, year_edu, hsize
        self.standardize_col_ids = np.arange(8)  # everything

    def fit(self, X, y=None):
        Xsel = X[:, self.standardize_col_ids]
        self.scaler.fit(Xsel)
        return self

    def transform(self, X):
        Xsel = X[:, self.standardize_col_ids]
        Xsel_zscore = self.scaler.transform(Xsel)
        X[:, self.standardize_col_ids] = Xsel_zscore
        return X

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)

@@ -9,6 +9,7 @@ from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation a
 from quapy.method.aggregative import EMQ, PACC, CC, PCC, MS2, MS, ACC
 from quapy.data import LabelledCollection
 from sklearn.preprocessing import StandardScaler
+from commons import *

 np.set_printoptions(linewidth=np.inf)

@@ -16,67 +17,6 @@ np.set_printoptions(linewidth=np.inf)
 cens_y = './data/cens_y.csv'
 survey_y = './data/survey_y.csv'

[removed: the local copies of load_csv(), get_dataset_by_area() and the Preprocessor class,
identical to the definitions now provided by Census/commons.py above]

 # Ate, Xte = load_csv(cens_y)
 Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

Census/methods.py (new file, 111 lines)

@@ -0,0 +1,111 @@
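# Building blocks for the area-ensemble experiments: per-area quantifiers, combination rules that
# aggregate their estimates (median/mean, optionally restricted to adjacent areas), and per-area
# model selection over the remaining areas.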
from abc import abstractmethod, ABC
from copy import deepcopy
from typing import List, Iterable
import numpy as np
import quapy as qp
from quapy.method.aggregative import AggregativeQuantifier
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier


class AreaQuantifier:
    def __init__(self, area:int, quantifier: BaseQuantifier):
        self.area = area
        self.quantifier = quantifier

    def quantify(self, X):
        return self.quantifier.quantify(X)


class CombinationRule(ABC):

    def __init__(self, area_quantifiers: List[AreaQuantifier]):
        self.area_quantifiers = area_quantifiers

    @abstractmethod
    def select_quantifiers(self, area:int, X):
        ...

    @abstractmethod
    def combination(self, choice, X):
        ...

    def predict(self, area:int, X):
        choice = self.select_quantifiers(area, X)
        prevalence = self.combination(choice, X)
        return prevalence


def optimize_ensemble(area_data: Iterable, q: BaseQuantifier, Madj=None, hyper=None, error='mae'):
    if hyper is None:
        hyper = {
            'classifier__C': np.logspace(-4, 4, 9),
            'classifier__class_weight': ['balanced', None]
        }

    labelled_collections = [(A, LabelledCollection(X, y)) for A, X, y in area_data]

    area_quantifiers = []
    for A, lc in labelled_collections:
        if Madj is None:
            rest = [lc_j for Aj, lc_j in labelled_collections if Aj != A]
        else:
            rest = [lc_j for Aj, lc_j in labelled_collections if Aj != A and Aj in Madj.get_adjacent(A)]
        q = optim(q, lc, rest, hyper, error)
        area_quantifiers.append(AreaQuantifier(A, q))

    return area_quantifiers


class AggregationRule(CombinationRule):

    def __init__(self, area_quantifiers: List[AreaQuantifier], adjacent_matrix: 'AdjMatrix' = None, aggr='median'):
        assert aggr in ['mean', 'median'], f'unknown {aggr=}'
        self.area_quantifiers = area_quantifiers
        self.adjacent_matrix = adjacent_matrix
        self.aggr = aggr

    def select_quantifiers(self, area:int, X):
        if self.adjacent_matrix is None:
            chosen = self.area_quantifiers
        else:
            adjacent = self.adjacent_matrix.get_adjacent(area)
            chosen = [q_i for q_i in self.area_quantifiers if q_i.area in adjacent]
        return chosen

    def combination(self, choice, X):
        prevs = np.asarray([q.quantify(X) for q in choice])
        if self.aggr == 'median':
            prev = np.median(prevs, axis=0)
        elif self.aggr == 'mean':
            prev = np.mean(prevs, axis=0)
        else:
            raise NotImplementedError(f'{self.aggr=} not implemented')
        return prev


def optim(q: BaseQuantifier, train: LabelledCollection, labelled_collections: Iterable[LabelledCollection], hyper:dict, error='mae'):
    q = deepcopy(q)
    prot = qp.protocol.IterateProtocol(labelled_collections)

    try:
        mod_sel = qp.model_selection.GridSearchQ(
            model=q,
            param_grid=hyper,
            protocol=prot,
            error=error,
            refit=False,
            n_jobs=-1
        ).fit(train)
        fitted = mod_sel.best_model_
    except ValueError:
        print(f'method {q} failed; training without model selection')
        fitted = q.fit(train)

    return fitted

Census/pairwise_2.py (new file, 86 lines)

@@ -0,0 +1,86 @@
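# Experiment (pairwise quantification): each quantifier is trained on one area and evaluated (MAE)
# on every other area, yielding one train-area x test-area table per quantifier.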
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import CC, PCC, ACC, PACC, EMQ
from commons import *
from table import Table
from tqdm import tqdm
import quapy as qp

np.set_printoptions(linewidth=np.inf)


def classifier():
    #return LogisticRegressionCV(class_weight='balanced', Cs=10)
    return LogisticRegressionCV()


def quantifiers():
    cls = classifier()
    yield 'MLPE', MLPE()
    yield 'CC', CC(cls)
    yield 'PCC', PCC(cls)
    yield 'ACC', ACC(cls)
    yield 'PACC', PACC(cls)


survey_y = './data/survey_y.csv'

Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)

trains = get_dataset_by_area(Atr, Xtr, ytr)
n_areas = len(trains)
areas = [Ai for Ai, _, _ in trains]

tables = []
text_outputs = []

benchmarks = [f'te-{Ai}' for Ai in areas]  # areas used as test
methods = [f'tr-{Ai}' for Ai in areas]     # areas on which a quantifier is trained

for q_name, q in quantifiers():
    table = Table(name=q_name, benchmarks=benchmarks, methods=methods, stat_test=None, color_mode='global')
    table.format.mean_prec = 4
    table.format.show_std = False
    table.format.sta = False
    table.format.remove_zero = True
    table.with_mean = True

    for i, (Ai, Xi, yi) in tqdm(enumerate(trains), total=n_areas):
        tr = LabelledCollection(Xi, yi)
        q.fit(tr)
        len_tr = len(tr)
        for j, (Aj, Xj, yj) in enumerate(trains):
            if i==j: continue
            te = LabelledCollection(Xj, yj)
            qp.environ["SAMPLE_SIZE"] = len(te)
            pred_prev = q.quantify(te.X)
            true_prev = te.prevalence()
            # err = qp.error.mrae(true_prev, pred_prev)
            err = qp.error.mae(true_prev, pred_prev)
            table.add(benchmark=f'te-{Aj}', method=f'tr-{Ai}', v=err)

    for test in benchmarks:
        values = table.get_benchmark_values(test)
        table.add(benchmark=test, method='Best', v=min(values))

    tables.append(table)
    text_outputs.append(f'{q_name} got mean {table.all_mean():.5f}, best mean {table.get_method_values("Best").mean():.5f}')

Table.LatexPDF(f'./results/pairwise/doc.pdf', tables)

with open(f'./results/classifier/output.txt', 'tw') as foo:
    foo.write('\n'.join(text_outputs))

Census/table.py (new file, 476 lines)

@@ -0,0 +1,476 @@
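# Utility for collecting results and rendering them as LaTeX tables / a standalone PDF report,
# with optional statistical tests and per-cell color coding.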
import numpy as np
from typing import Union, List
from collections.abc import Iterable
from dataclasses import dataclass
from scipy.stats import wilcoxon, ttest_ind_from_stats
import pandas as pd
import os
from pathlib import Path


@dataclass
class CellFormat:
    mean_prec: int = 3
    std_prec: int = 3
    show_std: bool = True
    remove_zero: bool = False
    color: bool = True
    maxtone: int = 50


class Cell:
    def __init__(self, format: CellFormat, group: 'CellGroup'):
        self.values = []
        self.format = format
        self.touch()
        self.group = group
        self.group.register_cell(self)

    def __len__(self):
        return len(self.values)

    def mean(self):
        if self.mean_ is None:
            self.mean_ = np.mean(self.values)
        return self.mean_

    def std(self):
        if self.std_ is None:
            self.std_ = np.std(self.values)
        return self.std_

    def touch(self):
        self.mean_ = None
        self.std_ = None

    def append(self, v: Union[float,Iterable]):
        if isinstance(v, Iterable):
            self.values.extend(v)
        else:  # note: 'else' added here; the committed version also appended the iterable after extending it
            self.values.append(v)
        self.touch()

    def isEmpty(self):
        return len(self)==0

    def isBest(self):
        best = self.group.best()
        if best is not None:
            return (best == self) or (np.isclose(best.mean(), self.mean()))
        return False

    def print_mean(self):
        if self.isEmpty():
            return ''
        else:
            return f'{self.mean():.{self.format.mean_prec}f}'

    def print(self):
        if self.isEmpty():
            return ''

        # mean
        # ---------------------------------------------------
        mean = self.print_mean()
        if self.format.remove_zero:
            mean = mean.replace('0.', '.')

        # std ?
        # ---------------------------------------------------
        if self.format.show_std:
            std = f' $\pm$ {self.std():.{self.format.std_prec}f}'
        else:
            std = ''

        # bold or statistical test
        # ---------------------------------------------------
        if self.isBest():
            str_cell = f'\\textbf{{{mean}{std}}}'
        else:
            comp_symbol = ''
            pval = self.group.compare(self)
            if pval is not None:
                if 0.005 > pval:
                    comp_symbol = ''
                elif 0.05 > pval >= 0.005:
                    comp_symbol = '$^{\dag}$'
                elif pval >= 0.05:
                    comp_symbol = '${\ddag}$'
            str_cell = f'{mean}{comp_symbol}{std}'

        # color ?
        # ---------------------------------------------------
        if self.format.color:
            str_cell += ' ' + self.group.color(self)

        return str_cell


class CellGroup:
    def __init__(self, lower_is_better=True, stat_test='wilcoxon', color_mode='local', color_global_min=None, color_global_max=None):
        assert stat_test in ['wilcoxon', 'ttest', None], \
            f"unknown {stat_test=}, valid ones are wilcoxon, ttest, or None"
        assert color_mode in ['local', 'global'], \
            f"unknown {color_mode=}, valid ones are local and global"
        if (color_global_min is not None or color_global_max is not None) and color_mode=='local':
            print('warning: color_global_min and color_global_max are only considered when color_mode==local')
        self.cells = []
        self.lower_is_better = lower_is_better
        self.stat_test = stat_test
        self.color_mode = color_mode
        self.color_global_min = color_global_min
        self.color_global_max = color_global_max

    def register_cell(self, cell: Cell):
        self.cells.append(cell)

    def non_empty_cells(self):
        return [c for c in self.cells if not c.isEmpty()]

    def max(self):
        cells = self.non_empty_cells()
        if len(cells)>0:
            return cells[np.argmax([c.mean() for c in cells])]
        return None

    def min(self):
        cells = self.non_empty_cells()
        if len(cells) > 0:
            return cells[np.argmin([c.mean() for c in cells])]
        return None

    def best(self) -> Cell:
        return self.min() if self.lower_is_better else self.max()

    def worst(self) -> Cell:
        return self.max() if self.lower_is_better else self.min()

    def isEmpty(self):
        return len(self.non_empty_cells())==0

    def compare(self, cell: Cell):
        best = self.best()
        best_n = len(best)
        cell_n = len(cell)
        if best_n > 0 and cell_n > 0:
            if self.stat_test == 'wilcoxon':
                try:
                    _, p_val = wilcoxon(best.values, cell.values)
                except ValueError:
                    p_val = None
                return p_val
            elif self.stat_test == 'ttest':
                best_mean, best_std = best.mean(), best.std()
                cell_mean, cell_std = cell.mean(), cell.std()
                _, p_val = ttest_ind_from_stats(best_mean, best_std, best_n, cell_mean, cell_std, cell_n)
                return p_val
            elif self.stat_test is None:
                return None
            else:
                raise ValueError(f'unknown statistical test {self.stat_test}')
        else:
            return None

    def color(self, cell: Cell):
        cell_mean = cell.mean()
        if self.color_mode == 'local':
            best = self.best()
            worst = self.worst()
            best_mean = best.mean()
            worst_mean = worst.mean()
            if best is None or worst is None or best_mean == worst_mean or cell.isEmpty():
                return ''
            # normalize val in [0,1]
            maxval = max(best_mean, worst_mean)
            minval = min(best_mean, worst_mean)
        else:
            maxval = self.color_global_max
            minval = self.color_global_min

        normval = (cell_mean - minval) / (maxval - minval)

        if self.lower_is_better:
            normval = 1 - normval

        normval = np.clip(normval, 0, 1)
        normval = normval * 2 - 1  # rescale to [-1,1]

        if normval < 0:
            color = 'red'
            tone = cell.format.maxtone * (-normval)
        else:
            color = 'green'
            tone = cell.format.maxtone * normval
        return f'\cellcolor{{{color}!{int(tone)}}}'


class Table:

    def __init__(self,
                 name,
                 benchmarks=None,
                 methods=None,
                 format:CellFormat=None,
                 lower_is_better=True,
                 stat_test='wilcoxon',
                 color_mode='local',
                 with_mean=True
                 ):
        self.name = name
        self.benchmarks = [] if benchmarks is None else benchmarks
        self.methods = [] if methods is None else methods
        self.format = format if format is not None else CellFormat()
        self.lower_is_better = lower_is_better
        self.stat_test = stat_test
        self.color_mode = color_mode
        self.with_mean = with_mean
        self.only_full_mean = True  # if False, compute the mean of partially empty methods also

        if self.color_mode == 'global':
            self.color_global_min = 0
            self.color_global_max = 1
        else:
            self.color_global_min = None
            self.color_global_max = None

        self.T = {}
        self.groups = {}

    def add(self, benchmark, method, v):
        cell = self.get(benchmark, method)
        cell.append(v)

    def get_benchmarks(self):
        return self.benchmarks

    def get_methods(self):
        return self.methods

    def n_benchmarks(self):
        return len(self.benchmarks)

    def n_methods(self):
        return len(self.methods)

    def _new_group(self):
        return CellGroup(self.lower_is_better, self.stat_test, color_mode=self.color_mode,
                         color_global_max=self.color_global_max, color_global_min=self.color_global_min)

    def get(self, benchmark, method) -> Cell:
        if benchmark not in self.benchmarks:
            self.benchmarks.append(benchmark)
        if benchmark not in self.groups:
            self.groups[benchmark] = self._new_group()
        if method not in self.methods:
            self.methods.append(method)
        b_idx = self.benchmarks.index(benchmark)
        m_idx = self.methods.index(method)
        idx = tuple((b_idx, m_idx))
        if idx not in self.T:
            self.T[idx] = Cell(self.format, group=self.groups[benchmark])
        cell = self.T[idx]
        return cell

    def get_value(self, benchmark, method) -> float:
        return self.get(benchmark, method).mean()

    def get_benchmark(self, benchmark):
        cells = [self.get(benchmark, method=m) for m in self.get_methods()]
        cells = [c for c in cells if not c.isEmpty()]
        return cells

    def get_method(self, method):
        cells = [self.get(benchmark=b, method=method) for b in self.get_benchmarks()]
        cells = [c for c in cells if not c.isEmpty()]
        return cells

    def get_method_means(self, method_order):
        mean_group = self._new_group()
        cells = []
        for method in method_order:
            method_mean = Cell(self.format, group=mean_group)
            for bench in self.get_benchmarks():
                mean_value = self.get_value(benchmark=bench, method=method)
                if not np.isnan(mean_value):
                    method_mean.append(mean_value)
            cells.append(method_mean)
        return cells

    def get_benchmark_values(self, benchmark):
        values = np.asarray([c.mean() for c in self.get_benchmark(benchmark)])
        return values

    def get_method_values(self, method):
        values = np.asarray([c.mean() for c in self.get_method(method)])
        return values

    def all_mean(self):
        values = [c.mean() for c in self.T.values() if not c.isEmpty()]
        return np.mean(values)

    def print(self):  # todo: missing method names?
        data_dict = {}
        data_dict['Benchmark'] = [b for b in self.get_benchmarks()]
        for method in self.get_methods():
            data_dict[method] = [self.get(bench, method).print_mean() for bench in self.get_benchmarks()]
        df = pd.DataFrame(data_dict)
        pd.set_option('display.max_columns', None)
        pd.set_option('display.max_rows', None)
        print(df.to_string(index=False))

    def tabular(self, path=None, benchmark_replace=None, method_replace=None, benchmark_order=None, method_order=None, transpose=False):
        if benchmark_replace is None:
            benchmark_replace = {}
        if method_replace is None:
            method_replace = {}
        if benchmark_order is None:
            benchmark_order = self.get_benchmarks()
        if method_order is None:
            method_order = self.get_methods()

        if transpose:
            row_order, row_replace = method_order, method_replace
            col_order, col_replace = benchmark_order, benchmark_replace
        else:
            row_order, row_replace = benchmark_order, benchmark_replace
            col_order, col_replace = method_order, method_replace

        n_cols = len(col_order)
        add_mean_col = self.with_mean and transpose
        add_mean_row = self.with_mean and not transpose

        last_col_idx = n_cols+2 if add_mean_col else n_cols+1

        if self.with_mean:
            mean_cells = self.get_method_means(method_order)

        lines = []
        lines.append('\\begin{tabular}{|c' + '|c' * n_cols + ('||c' if add_mean_col else '') + "|}")
        lines.append(f'\\cline{{2-{last_col_idx}}}')

        l = '\multicolumn{1}{c|}{} & '
        l += ' & '.join([col_replace.get(col, col) for col in col_order])
        if add_mean_col:
            l += ' & Ave.'
        l += ' \\\\\\hline'
        lines.append(l)

        for i, row in enumerate(row_order):
            rowname = row_replace.get(row, row)
            l = rowname + ' & '
            l += ' & '.join([
                self.get(benchmark=col if transpose else row, method=row if transpose else col).print()
                for col in col_order
            ])
            if add_mean_col:
                l+= ' & ' + mean_cells[i].print()
            l += ' \\\\\\hline'
            lines.append(l)

        if add_mean_row:
            lines.append('\hline')
            l = 'Ave. & '
            l+= ' & '.join([mean_cell.print() for mean_cell in mean_cells])
            l += ' \\\\\\hline'
            lines.append(l)

        lines.append('\\end{tabular}')

        tabular_tex = '\n'.join(lines)

        if path is not None:
            parent = Path(path).parent
            if parent:
                os.makedirs(parent, exist_ok=True)
            with open(path, 'wt') as foo:
                foo.write(tabular_tex)

        return tabular_tex

    def table(self, tabular_path, benchmark_replace=None, method_replace=None, resizebox=True, caption=None, label=None, benchmark_order=None, method_order=None, transpose=False):
        if benchmark_replace is None:
            benchmark_replace = {}
        if method_replace is None:
            method_replace = {}

        lines = []
        lines.append('\\begin{table}')
        lines.append('\center')
        if resizebox:
            lines.append('\\resizebox{\\textwidth}{!}{%')

        tabular_str = self.tabular(tabular_path, benchmark_replace, method_replace, benchmark_order, method_order, transpose)
        if tabular_path is None:
            lines.append(tabular_str)
        else:
            lines.append(f'\input{{tables/{Path(tabular_path).name}}}')

        if resizebox:
            lines.append('}%')

        if caption is None:
            caption = tabular_path.replace('_', '\_')
        lines.append(f'\caption{{{caption}}}')
        if label is not None:
            lines.append(f'\label{{{label}}}')
        lines.append('\end{table}')

        table_tex = '\n'.join(lines)

        return table_tex

    def document(self, tex_path, tabular_dir='tables', *args, **kwargs):
        Table.Document(tex_path, tables=[self], tabular_dir=tabular_dir, *args, **kwargs)

    def latexPDF(self, pdf_path, tabular_dir='tables', *args, **kwargs):
        return Table.LatexPDF(pdf_path, tables=[self], tabular_dir=tabular_dir, *args, **kwargs)

    @classmethod
    def Document(self, tex_path, tables:List['Table'], tabular_dir='tables', *args, **kwargs):
        lines = []
        lines.append('\\documentclass[10pt,a4paper]{article}')
        lines.append('\\usepackage[utf8]{inputenc}')
        lines.append('\\usepackage{amsmath}')
        lines.append('\\usepackage{amsfonts}')
        lines.append('\\usepackage{amssymb}')
        lines.append('\\usepackage{graphicx}')
        lines.append('\\usepackage{xcolor}')
        lines.append('\\usepackage{colortbl}')
        lines.append('')
        lines.append('\\begin{document}')
        for table in tables:
            lines.append('')
            lines.append(table.table(os.path.join(Path(tex_path).parent, tabular_dir, table.name + '_table.tex'), *args, **kwargs))
        lines.append('\\end{document}')

        document = '\n'.join(lines)

        parent = Path(tex_path).parent
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(tex_path, 'wt') as foo:
            foo.write(document)

        return document

    @classmethod
    def LatexPDF(cls, pdf_path: str, tables:List['Table'], tabular_dir: str = 'tables', *args, **kwargs):
        assert pdf_path.endswith('.pdf'), f'{pdf_path=} does not seem a valid name for a pdf file'
        tex_path = pdf_path.replace('.pdf', '.tex')
        cls.Document(tex_path, tables, tabular_dir, *args, **kwargs)
        dir = Path(pdf_path).parent
        pwd = os.getcwd()
        print('currently in', pwd)
        print("[Tables Done] running latex")
        os.chdir(dir)
        os.system('pdflatex ' + Path(tex_path).name)
        basename = Path(tex_path).name.replace('.tex', '')
        os.system(f'rm {basename}.aux {basename}.bbl {basename}.blg {basename}.log {basename}.out {basename}.dvi')
        os.chdir(pwd)

Census/tmp.py (new file, 16 lines)

@@ -0,0 +1,16 @@
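# Quick sanity check of the adjacency matrix against the census area codes.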
import numpy as np
import pandas as pd
from Census.commons import AdjMatrix, load_csv, get_dataset_by_area

census = './data/cens_y.csv'

Areas, X = load_csv(census, use_yhat=True)
data = get_dataset_by_area(Areas, X)
areas = [a for a, *_ in data]
print(f'Area codes={areas}')

A = AdjMatrix('./data/matrice_adiacenza.csv')

print(A.adjacent(45, 46))
print(A.get_adjacent(50))