forked from moreo/QuaPy

adding multi-label classification methods

This commit is contained in:
parent dc2fa05cf8
commit 4572ec266d
@@ -7,12 +7,12 @@ from tqdm import tqdm
 from skmultilearn.dataset import load_dataset, available_data_sets
 from scipy.sparse import csr_matrix
 import quapy as qp
-from MultiLabel.main import load_results
-from MultiLabel.mlclassification import MultilabelStackedClassifier
+from MultiLabel.main import load_results, SKMULTILEARN_RED_DATASETS, TC_DATASETS, sample_size
+from MultiLabel.mlclassification import MLStackedClassifier
 from MultiLabel.mldata import MultilabelledCollection
-from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
+from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
     MLACC, \
-    MLPACC, MultilabelNaiveAggregativeQuantifier
+    MLPACC, MLNaiveAggregativeQuantifier
 from MultiLabel.tabular import Table
 from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
 import numpy as np
@@ -22,29 +22,56 @@ import sys
 import os
 import pickle
 
-models = ['NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', 'NaiveHDy', 'NaiveSLD']
-datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
+models = [#'MLPE',
+    'NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', #'NaiveHDy', 'NaiveSLD',
+    'StackCC', 'StackPCC', 'StackACC', 'StackPACC',
+    'MRQ-CC', 'MRQ-PCC', 'MRQ-ACC', 'MRQ-PACC',
+    'MRQ-StackCC', 'MRQ-StackPCC', 'MRQ-StackACC', 'MRQ-StackPACC',
+    'MRQ-StackCC-app', 'MRQ-StackPCC-app', 'MRQ-StackACC-app', 'MRQ-StackPACC-app',
+    'LSP-CC', 'LSP-ACC'
+]
+
+# datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
+datasets = TC_DATASETS
 
 
 def generate_table(path, protocol, error):
-    print(f'generating {path}')
-    table = Table(datasets, models)
-    for dataset, model in itertools.product(datasets, models):
+
+    def compute_score_job(args):
+        dataset, model = args
         result_path = f'{opt.results}/{dataset}_{model}.pkl'
         if os.path.exists(result_path):
             print('+', end='')
             sys.stdout.flush()
             result = load_results(result_path)
             true_prevs, estim_prevs = result[protocol]
             scores = np.asarray([error(trues, estims) for trues, estims in zip(true_prevs, estim_prevs)]).flatten()
+            return dataset, model, scores
+        print('-', end='')
+        sys.stdout.flush()
+        return None
+
+    print(f'\ngenerating {path}')
+    table = Table(datasets, models, prec_mean=4, significance_test='wilcoxon')
+    results = qp.util.parallel(compute_score_job, list(itertools.product(datasets, models)), n_jobs=-1)
+    print()
+
+    for r in results:
+        if r is not None:
+            dataset, model, scores = r
             table.add(dataset, model, scores)
 
     tabular = """
     \\resizebox{\\textwidth}{!}{%
     \\begin{tabular}{|c||""" + ('c|' * len(models)) + """} \hline
     """
-    dataset_replace = {'tmc2007_500': 'tmc2007\_500'}
+    dataset_replace = {'tmc2007_500': 'tmc2007\_500', 'tmc2007_500-red': 'tmc2007\_500-red'}
     method_replace = {}
 
-    tabular += table.latexTabular(benchmark_replace=dataset_replace, method_replace=method_replace)
+    tabular += table.latexTabularT(benchmark_replace=dataset_replace, method_replace=method_replace, side=True)
     tabular += """
     \end{tabular}%
     }
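
The refactoring above extracts the per-(dataset, model) scoring into compute_score_job so that qp.util.parallel can dispatch the jobs concurrently. A rough standard-library equivalent of that dispatch pattern, with a hypothetical placeholder worker rather than the commit's actual scoring code:

    from multiprocessing import Pool
    import itertools

    def compute_score_job(args):
        # placeholder worker: unpack one (dataset, model) pair and return a score
        dataset, model = args
        return dataset, model, 0.0  # a real worker would load saved results and score them

    if __name__ == '__main__':
        jobs = list(itertools.product(['reuters21578', 'ohsumed'], ['NaiveCC', 'StackCC']))
        with Pool(processes=4) as pool:
            results = pool.map(compute_score_job, jobs)  # one result per job, order preserved
        print(results)
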
@@ -61,13 +88,17 @@ if __name__ == '__main__':
                         help=f'path where to store the tables')
     opt = parser.parse_args()
 
-    os.makedirs(opt.results, exist_ok=True)
+    assert os.path.exists(opt.results), f'result directory {opt.results} does not exist'
     os.makedirs(opt.tablepath, exist_ok=True)
 
-    eval_error = qp.error.ae
-    generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=eval_error)
-    generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=eval_error)
+    qp.environ["SAMPLE_SIZE"] = sample_size
+    absolute_error = qp.error.ae
+    relative_absolute_error = qp.error.rae
+
+    generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=absolute_error)
+    generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=absolute_error)
+    generate_table(f'{opt.tablepath}/npp.rae.tex', protocol='npp', error=relative_absolute_error)
+    generate_table(f'{opt.tablepath}/app.rae.tex', protocol='app', error=relative_absolute_error)
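
Each table cell aggregates an error measure over many test samples: true and estimated prevalence vectors are compared pairwise, then averaged. A minimal sketch of what an absolute-error function in the style of qp.error.ae computes, assuming it is the mean absolute difference between prevalence vectors (QuaPy's exact implementation may differ):

    import numpy as np

    def absolute_error(true_prev, estim_prev):
        # mean absolute difference between two prevalence vectors
        return np.abs(np.asarray(true_prev) - np.asarray(estim_prev)).mean()

    true_prevs = [np.array([0.7, 0.3]), np.array([0.5, 0.5])]
    estim_prevs = [np.array([0.6, 0.4]), np.array([0.4, 0.6])]
    scores = np.asarray([absolute_error(t, e) for t, e in zip(true_prevs, estim_prevs)])
    print(scores.mean())  # the value a table cell would summarize
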
@@ -7,11 +7,11 @@ from tqdm import tqdm
 from skmultilearn.dataset import load_dataset, available_data_sets
 from scipy.sparse import csr_matrix
 import quapy as qp
-from MultiLabel.mlclassification import MultilabelStackedClassifier
+from MultiLabel.mlclassification import MLStackedClassifier, LabelSpacePartion, MLTwinSVM, MLknn
 from MultiLabel.mldata import MultilabelledCollection
-from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
+from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
     MLACC, \
-    MLPACC, MultilabelNaiveAggregativeQuantifier
+    MLPACC, MLNaiveAggregativeQuantifier, MLMLPE
 from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
 import numpy as np
 from data.dataset import Dataset
@@ -35,80 +35,136 @@ def calibratedCls():
 sample_size = 100
 n_samples = 5000
 
+SKMULTILEARN_ALL_DATASETS = sorted(set([x[0] for x in available_data_sets().keys()]))
+SKMULTILEARN_RED_DATASETS = [x+'-red' for x in SKMULTILEARN_ALL_DATASETS]
+TC_DATASETS = ['reuters21578', 'jrcall', 'ohsumed', 'rcv1']
+
+DATASETS = TC_DATASETS
+
 
 def models():
-    yield 'NaiveCC', MultilabelNaiveAggregativeQuantifier(CC(cls()))
-    yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
-    yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
-    yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
-    # yield 'NaiveHDy', MultilabelNaiveAggregativeQuantifier(HDy(cls()))
-    # yield 'NaiveSLD', MultilabelNaiveAggregativeQuantifier(EMQ(calibratedCls()))
-    yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
-    yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
-    yield 'StackACC', MLACC(MultilabelStackedClassifier(cls()))
-    yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls()))
+    yield 'MLPE', MLMLPE()
+    yield 'NaiveCC', MLNaiveAggregativeQuantifier(CC(cls()))
+    yield 'NaivePCC', MLNaiveAggregativeQuantifier(PCC(cls()))
+    yield 'NaiveACC', MLNaiveAggregativeQuantifier(ACC(cls()))
+    yield 'NaivePACC', MLNaiveAggregativeQuantifier(PACC(cls()))
+    # yield 'NaiveHDy', MLNaiveAggregativeQuantifier(HDy(cls()))
+    # yield 'NaiveSLD', MLNaiveAggregativeQuantifier(EMQ(calibratedCls()))
+    yield 'StackCC', MLCC(MLStackedClassifier(cls()))
+    yield 'StackPCC', MLPCC(MLStackedClassifier(cls()))
+    yield 'StackACC', MLACC(MLStackedClassifier(cls()))
+    yield 'StackPACC', MLPACC(MLStackedClassifier(cls()))
     # yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random'))
     common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
-    yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common)
-    yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common)
-    yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common)
-    yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
-    yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common)
-    # yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    # yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    # yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    # yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-CC', MLRegressionQuantification(MLNaiveQuantifier(CC(cls())), **common)
+    yield 'MRQ-PCC', MLRegressionQuantification(MLNaiveQuantifier(PCC(cls())), **common)
+    yield 'MRQ-ACC', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common)
+    yield 'MRQ-PACC', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common)
+    yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), protocol='app', **common)
     # yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common)
+    # yield 'LSP-CC', MLCC(LabelSpacePartion(cls()))
+    # yield 'LSP-ACC', MLACC(LabelSpacePartion(cls()))
+    # yield 'TwinSVM-CC', MLCC(MLTwinSVM())
+    # yield 'TwinSVM-ACC', MLACC(MLTwinSVM())
+    yield 'MLKNN-CC', MLCC(MLknn())
+    yield 'MLKNN-PCC', MLPCC(MLknn())
+    yield 'MLKNN-ACC', MLACC(MLknn())
+    yield 'MLKNN-PACC', MLPACC(MLknn())
 
 
-# dataset = 'reuters21578'
-# picklepath = '/home/moreo/word-class-embeddings/pickles'
-# data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle')
-# Xtr, Xte = data.vectorize()
-# ytr = data.devel_labelmatrix.todense().getA()
-# yte = data.test_labelmatrix.todense().getA()
+def get_dataset(dataset_name, dopickle=True):
+    datadir = f'{qp.util.get_quapy_home()}/pickles'
+    datapath = f'{datadir}/{dataset_name}.pkl'
+    if dopickle:
+        if os.path.exists(datapath):
+            print(f'returning pickled object in {datapath}')
+            return pickle.load(open(datapath, 'rb'))
 
-# remove categories with < 10 training documents
-# to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
-# ytr = ytr[:, to_keep]
-# yte = yte[:, to_keep]
-# print(f'num categories = {ytr.shape[1]}')
+    if dataset_name in SKMULTILEARN_ALL_DATASETS + SKMULTILEARN_RED_DATASETS:
+        clean_name = dataset_name.replace('-red','')
+        Xtr, ytr, feature_names, label_names = load_dataset(clean_name, 'train')
+        Xte, yte, _, _ = load_dataset(clean_name, 'test')
+        print(f'n-labels = {len(label_names)}')
 
-def datasets():
-    dataset_list = sorted(set([x[0] for x in available_data_sets().keys()]))
-    for dataset_name in dataset_list:
-        yield dataset_name
+        Xtr = csr_matrix(Xtr)
+        Xte = csr_matrix(Xte)
 
+        ytr = ytr.todense().getA()
+        yte = yte.todense().getA()
 
-def get_dataset(dataset_name):
-    Xtr, ytr, feature_names, label_names = load_dataset(dataset_name, 'train')
-    Xte, yte, _, _ = load_dataset(dataset_name, 'test')
-    print(f'n-labels = {len(label_names)}')
+        if dataset_name.endswith('-red'):
+            TO_SELECT = 10
+            nC = ytr.shape[1]
+            tr_counts = ytr.sum(axis=0)
+            te_counts = yte.sum(axis=0)
+            if nC > TO_SELECT:
+                Y = ytr.T.dot(ytr)  # class-class coincidence matrix
+                Y[np.triu_indices(nC)] = 0  # zeroing all duplicates entries and the diagonal
+                order_ij = np.argsort(-Y, axis=None)
+                selected = set()
+                p=0
+                while len(selected) < TO_SELECT:
+                    highest_index = order_ij[p]
+                    class_i = highest_index // nC
+                    class_j = highest_index % nC
+                    # if there is only one class to go, then add the most populated one
+                    most_populated, least_populated = (class_i, class_j) if tr_counts[class_i] > tr_counts[class_j] else (class_j, class_i)
+                    if te_counts[most_populated]>0:
+                        selected.add(most_populated)
+                    if len(selected) < TO_SELECT:
+                        if te_counts[least_populated]>0:
+                            selected.add(least_populated)
+                    p+=1
+                selected = np.asarray(sorted(selected))
+                ytr = ytr[:,selected]
+                yte = yte[:, selected]
+        # else:
+            # remove categories without positives in the training or test splits
+            # valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
+            # ytr = ytr[:, valid_categories]
+            # yte = yte[:, valid_categories]
 
-    Xtr = csr_matrix(Xtr)
-    Xte = csr_matrix(Xte)
+    elif dataset_name in TC_DATASETS:
+        picklepath = '/home/moreo/word-class-embeddings/pickles'
+        data = Dataset.load(dataset_name, pickle_path=f'{picklepath}/{dataset_name}.pickle')
+        Xtr, Xte = data.vectorize()
+        ytr = data.devel_labelmatrix.todense().getA()
+        yte = data.test_labelmatrix.todense().getA()
 
-    ytr = ytr.todense().getA()
-    yte = yte.todense().getA()
+        Xtr = csr_matrix(Xtr)
+        Xte = csr_matrix(Xte)
+        # remove categories with < 50 training or test documents
+        # to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
+        # keep the 10 most populated categories
+        to_keep = np.argsort(ytr.sum(axis=0))[-10:]
+        ytr = ytr[:, to_keep]
+        yte = yte[:, to_keep]
+        print(f'num categories = {ytr.shape[1]}')
 
-    # remove categories without positives in the training or test splits
-    valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
-    ytr = ytr[:, valid_categories]
-    yte = yte[:, valid_categories]
+    else:
+        raise ValueError(f'unknown dataset {dataset_name}')
 
     train = MultilabelledCollection(Xtr, ytr)
     test = MultilabelledCollection(Xte, yte)
 
+    if dopickle:
+        os.makedirs(datadir, exist_ok=True)
+        pickle.dump((train, test), open(datapath, 'wb'), pickle.HIGHEST_PROTOCOL)
+
     return train, test
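
The '-red' branch reduces a dataset to the TO_SELECT labels involved in the strongest label co-occurrences: ytr.T.dot(ytr) counts, for every pair of labels, the documents in which both are positive, and the upper triangle plus the diagonal are zeroed so each pair is ranked only once. The same computation on a toy label matrix (values illustrative):

    import numpy as np

    ytr = np.array([[1, 1, 0, 0],
                    [1, 1, 1, 0],
                    [0, 1, 1, 0],
                    [1, 0, 0, 1]])        # documents x labels

    nC = ytr.shape[1]
    Y = ytr.T.dot(ytr)                    # class-class coincidence matrix
    Y[np.triu_indices(nC)] = 0            # keep each unordered pair once, drop the diagonal
    order_ij = np.argsort(-Y, axis=None)  # flat indices, most co-occurring pair first
    i, j = order_ij[0] // nC, order_ij[0] % nC
    print(f'most co-occurring pair: labels {i} and {j} ({Y[i, j]} shared documents)')
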
@@ -176,8 +232,8 @@ def run_experiment(dataset_name, model_name, model):
 
     print(f'runing experiment {dataset_name} x {model_name}')
     train, test = get_dataset(dataset_name)
-    if train.n_classes>100:
-        return
+    # if train.n_classes>100:
+    #     return
 
     print_info(train, test)
 
@@ -186,8 +242,6 @@ def run_experiment(dataset_name, model_name, model):
     results_npp = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
     results_app = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=11, repeats=5)
 
     save_results(results_npp, results_app, result_path)
-    results_npp2, results_app2 = load_results(result_path)
-    print('pass')
 
 
 if __name__ == '__main__':
@@ -198,7 +252,7 @@ if __name__ == '__main__':
 
     os.makedirs(opt.results, exist_ok=True)
 
-    for datasetname, (modelname,model) in itertools.product(datasets(), models()):
+    for datasetname, (modelname,model) in itertools.product(DATASETS, models()):
         run_experiment(datasetname, modelname, model)
 
 
@@ -4,9 +4,19 @@ from sklearn.calibration import CalibratedClassifierCV
 from sklearn.linear_model import LogisticRegression
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.preprocessing import StandardScaler
+from skmultilearn.adapt import MLTSVM
+
+from skmultilearn.ensemble import LabelSpacePartitioningClassifier
+from skmultilearn.problem_transform import LabelPowerset
+from skmultilearn.cluster import NetworkXLabelGraphClusterer, LabelCooccurrenceGraphBuilder
+
+from skmultilearn.embedding import SKLearnEmbedder, EmbeddingClassifier
+from sklearn.manifold import SpectralEmbedding
+from sklearn.ensemble import RandomForestRegressor
+from skmultilearn.adapt import MLkNN
 
 
-class MultilabelStackedClassifier:  # aka Funnelling Monolingual
+class MLStackedClassifier:  # aka Funnelling Monolingual
     def __init__(self, base_estimator=LogisticRegression()):
         if not hasattr(base_estimator, 'predict_proba'):
             print('the estimator does not seem to be probabilistic: calibrating')
@@ -31,4 +41,51 @@ class MultilabelStackedClassifier:  # aka Funnelling Monolingual
     def predict_proba(self, X):
         P = self.base.predict_proba(X)
         P = self.norm.transform(P)
-        return self.meta.predict_proba(P)
+        return self.meta.predict_proba(P)
+
+
+class LabelSpacePartion:
+    def __init__(self, base_estimator=LogisticRegression()):
+        graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
+        self.classifier = LabelSpacePartitioningClassifier(
+            classifier=LabelPowerset(classifier=base_estimator),
+            clusterer=NetworkXLabelGraphClusterer(graph_builder, method='louvain')
+        )
+
+    def fit(self, X, y):
+        return self.classifier.fit(X, y)
+
+    def predict(self, X):
+        return self.classifier.predict(X).todense().getA()
+
+
+class MLTwinSVM:
+    def __init__(self):
+        self.classifier = MLTSVM()
+
+    def fit(self, X, y):
+        return self.classifier.fit(X, y)
+
+    def predict(self, X):
+        return self.classifier.predict(X).todense().getA()
+
+
+class MLknn:
+    #http://scikit.ml/api/skmultilearn.embedding.classifier.html#skmultilearn.embedding.EmbeddingClassifier
+    #notes: need to install package openne
+    def __init__(self):
+        self.classifier = EmbeddingClassifier(
+            SKLearnEmbedder(SpectralEmbedding(n_components=10)),
+            RandomForestRegressor(n_estimators=10),
+            MLkNN(k=5)
+        )
+
+    def fit(self, X, y):
+        return self.classifier.fit(X, y)
+
+    def predict(self, X):
+        return self.classifier.predict(X).todense().getA()
+
+    def predict_proba(self, X):
+        return self.classifier.predict_proba(X)
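
The renamed MLStackedClassifier keeps the funnelling scheme: a base multi-label classifier maps documents into a posterior-probability space, and a meta classifier is trained in that space. A self-contained sketch of the idea with plain scikit-learn components (dataset and sizes are illustrative):

    from sklearn.datasets import make_multilabel_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import StandardScaler

    X, y = make_multilabel_classification(n_samples=200, n_features=30, n_classes=5, random_state=0)

    base = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X, y)
    P = base.predict_proba(X)                  # documents x labels posterior matrix
    norm = StandardScaler().fit(P)             # z-score the posterior space
    meta = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(norm.transform(P), y)

    # prediction funnels new documents through base posteriors, then the meta classifier
    y_hat = meta.predict(norm.transform(base.predict_proba(X)))
    print(y_hat.shape)                         # (200, 5)
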
@@ -34,6 +34,10 @@ class MultilabelledCollection:
     def n_classes(self):
         return len(self.classes_)
 
+    @property
+    def n_features(self):
+        return self.instances.shape[1]
+
     @property
     def binary(self):
         return False
@@ -43,8 +47,8 @@ class MultilabelledCollection:
 
     def sampling_multi_index(self, size, cat, prev=None):
         if prev is None:  # no prevalence was indicated; returns an index for uniform sampling
-            return np.random.choice(len(self), size, replace=size>len(self))
-        aux = LabelledCollection(self.__gen_index(), self.labels[:,cat])
+            return np.random.choice(len(self), size, replace=size > len(self))
+        aux = LabelledCollection(self.__gen_index(), self.labels[:, cat])
         return aux.sampling_index(size, *[1-prev, prev])
 
     def uniform_sampling_multi_index(self, size):
@@ -9,7 +9,7 @@ from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, Mult
     ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor
 
 import quapy as qp
-from MultiLabel.mlclassification import MultilabelStackedClassifier
+from MultiLabel.mlclassification import MLStackedClassifier
 from MultiLabel.mldata import MultilabelledCollection
 from method.aggregative import CC, ACC, PACC, AggregativeQuantifier
 from method.base import BaseQuantifier
@@ -25,7 +25,19 @@ class MLQuantifier:
     def quantify(self, instances): ...
 
 
+class MLMLPE(MLQuantifier):
+    def fit(self, data: MultilabelledCollection):
+        self.tr_prev = data.prevalence()
+        return self
+
+    def quantify(self, instances):
+        return self.tr_prev
+
+
 class MLAggregativeQuantifier(MLQuantifier):
     def __init__(self, mlcls):
         self.learner = mlcls
 
     def fit(self, data:MultilabelledCollection):
         self.learner.fit(*data.Xy)
         return self
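
MLMLPE added above is the trivial baseline (maximum-likelihood prevalence estimation): it memorizes the training prevalence and returns it for every test sample, which any useful quantifier should beat. The same behaviour as a standalone sketch (class name hypothetical):

    import numpy as np

    class TrainPrevalenceBaseline:
        # always answers with the training prevalence, ignoring the test sample
        def fit(self, y_train):                        # y_train: documents x labels, binary
            pos = y_train.mean(axis=0)                 # positive prevalence per label
            self.tr_prev = np.vstack([1 - pos, pos]).T # one (neg, pos) row per label
            return self

        def quantify(self, instances):
            return self.tr_prev

    y_train = np.array([[1, 0], [1, 1], [0, 1], [1, 0]])
    print(TrainPrevalenceBaseline().fit(y_train).quantify(None))
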
@@ -42,9 +54,6 @@ class MLAggregativeQuantifier(MLQuantifier):
 
 
 class MLCC(MLAggregativeQuantifier):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def preclassify(self, instances):
         return self.learner.predict(instances)
 
@@ -55,16 +64,11 @@ class MLCC(MLAggregativeQuantifier):
 
 
 class MLPCC(MLCC):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def preclassify(self, instances):
         return self.learner.predict_proba(instances)
 
 
 class MLACC(MLCC):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def fit(self, data:MultilabelledCollection, train_prop=0.6):
         self.classes_ = data.classes_
@@ -88,8 +92,6 @@ class MLACC(MLCC):
 
 
 class MLPACC(MLPCC):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def fit(self, data:MultilabelledCollection, train_prop=0.6):
         self.classes_ = data.classes_
 
@@ -109,7 +111,7 @@ class MLPACC(MLPCC):
         return pacc_prevs
 
 
-class MultilabelNaiveQuantifier(MLQuantifier):
+class MLNaiveQuantifier(MLQuantifier):
     def __init__(self, q:BaseQuantifier, n_jobs=-1):
         self.q = q
         self.estimators = None
@@ -132,7 +134,7 @@ class MultilabelNaiveQuantifier(MLQuantifier):
         return np.asarray([neg_prevs, pos_prevs]).T
 
 
-class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregativeQuantifier):
+class MLNaiveAggregativeQuantifier(MLNaiveQuantifier, MLAggregativeQuantifier):
     def __init__(self, q:AggregativeQuantifier, n_jobs=-1):
         assert isinstance(q, AggregativeQuantifier), 'the quantifier is not of type aggregative!'
         self.q = q
@@ -156,7 +158,7 @@ class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregat
 
 class MLRegressionQuantification:
     def __init__(self,
-                 mlquantifier=MultilabelNaiveQuantifier(CC(LinearSVC())),
+                 mlquantifier=MLNaiveQuantifier(CC(LinearSVC())),
                  regression='ridge',
                  protocol='npp',
                  n_samples=500,
@@ -201,36 +203,31 @@ class MLRegressionQuantification:
 
         return Xs, ys
 
+    def _extract_features(self, sample, Xs, ys, samples_mean, samples_std):
+        ys.append(sample.prevalence()[:, 1])
+        Xs.append(self.estimator.quantify(sample.instances)[:, 1])
+        if self.means:
+            samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
+        if self.stds:
+            samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
+
     def generate_samples_npp(self, val):
-        samples_mean = []
-        samples_std = []
-        Xs = []
-        ys = []
+        Xs, ys = [], []
+        samples_mean, samples_std = [], []
         for sample in val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
-            ys.append(sample.prevalence()[:, 1])
-            Xs.append(self.estimator.quantify(sample.instances)[:, 1])
-            if self.means:
-                samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
-            if self.stds:
-                samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
+            self._extract_features(sample, Xs, ys, samples_mean, samples_std)
         return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
 
     def generate_samples_app(self, val):
-        samples_mean = []
-        samples_std = []
-        Xs = []
-        ys = []
+        Xs, ys = [], []
+        samples_mean, samples_std = [], []
         ncats = len(self.classes_)
         nprevs = 21
        repeats = max(self.n_samples // (ncats * nprevs), 1)
         for cat in self.classes_:
             for sample in val.artificial_sampling_generator(sample_size=self.sample_size, category=cat, n_prevalences=nprevs, repeats=repeats):
-                ys.append(sample.prevalence()[:, 1])
-                Xs.append(self.estimator.quantify(sample.instances)[:, 1])
-                if self.means:
-                    samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
-                if self.stds:
-                    samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
+                self._extract_features(sample, Xs, ys, samples_mean, samples_std)
         return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
 
     def fit(self, data:MultilabelledCollection):
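
MLRegressionQuantification, whose sample-generation routines are refactored above into _extract_features, learns a regressor from estimated prevalence vectors (the features Xs) to true ones (the targets ys) over many validation samples. The correction idea in isolation, on toy data (the commit's pipeline builds Xs from an underlying quantifier instead):

    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    true_prevs = rng.dirichlet(np.ones(5), size=200)   # 200 samples x 5 labels
    bias = rng.normal(0.1, 0.05, true_prevs.shape)
    estim_prevs = np.clip(true_prevs + bias, 0, 1)     # systematically biased estimates

    reg = Ridge().fit(estim_prevs, true_prevs)         # learn to undo the bias
    corrected = reg.predict(estim_prevs)
    print(np.abs(corrected - true_prevs).mean(),       # error after correction ...
          np.abs(estim_prevs - true_prevs).mean())     # ... vs before
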
@@ -6,10 +6,10 @@ from scipy.stats import ttest_ind_from_stats, wilcoxon
 class Table:
     VALID_TESTS = [None, "wilcoxon", "ttest"]
 
-    def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
+    def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='ttest', prec_mean=3,
                  clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
                  color=True):
-        assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
+        assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
 
         self.benchmarks = np.asarray(benchmarks)
         self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}
@@ -21,7 +21,7 @@ class Table:
         # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
         self._addmap('values', dtype=object)
         self.lower_is_better = lower_is_better
-        self.ttest = ttest
+        self.ttest = significance_test
         self.prec_mean = prec_mean
         self.clean_zero = clean_zero
         self.show_std = show_std
@@ -156,8 +156,9 @@ class Table:
         return all(self.map['fill'][:, self.method_index[col]])
 
     def _addave(self):
-        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, ttest=self.ttest, average=False,
-                    missing=self.missing, missing_str=self.missing_str)
+        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
+                    missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
+                    show_std=self.show_std)
         for col in self.methods:
             values = None
             if self._is_column_full(col):
@@ -267,12 +268,37 @@ class Table:
         tab += self.latexAverage()
         return tab
 
+    def latexTabularT(self, benchmark_replace={}, method_replace={}, average=True, side=False):
+        def withside(label):
+            return '\side{'+label+'}' if side else label
+
+        tab = ' & '
+        tab += ' & '.join([withside(benchmark_replace.get(col, col)) for col in self.benchmarks])
+        if average:
+            tab += ' & ' + withside('Ave')
+        tab += ' \\\\\hline\n'
+        for row in self.methods:
+            rowname = method_replace.get(row, row)
+            tab += rowname + ' & '
+            tab += self.latexRowT(row, endl='')
+            if average:
+                tab += ' & '
+                tab += self.average.latexCell('ave', row)
+            tab += '\\\\\hline\n'
+        return tab
+
     def latexRow(self, benchmark, endl='\\\\\hline\n'):
         s = [self.latexCell(benchmark, col) for col in self.methods]
         s = ' & '.join(s)
         s += ' ' + endl
         return s
 
+    def latexRowT(self, method, endl='\\\\\hline\n'):
+        s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
+        s = ' & '.join(s)
+        s += ' ' + endl
+        return s
+
     def latexAverage(self, endl='\\\\\hline\n'):
         if self.add_average:
             return self.average.latexRow('ave', endl=endl)
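
latexTabularT writes the table transposed with respect to latexTabular: benchmarks run along the header row and each method becomes a row, which suits many-method/few-dataset layouts. A toy rendering of that layout as a plain function, without the Table class's colouring and significance marks:

    def latex_tabular_T(scores, benchmarks, methods):
        # header row: one column per benchmark
        lines = [' & ' + ' & '.join(benchmarks) + ' \\\\\\hline']
        for m in methods:
            # one row per method, one cell per benchmark
            cells = [f'{scores[(b, m)]:.3f}' for b in benchmarks]
            lines.append(m + ' & ' + ' & '.join(cells) + ' \\\\\\hline')
        return '\n'.join(lines)

    scores = {('reuters21578', 'NaiveCC'): 0.042, ('ohsumed', 'NaiveCC'): 0.061,
              ('reuters21578', 'StackCC'): 0.035, ('ohsumed', 'StackCC'): 0.052}
    print(latex_tabular_T(scores, ['reuters21578', 'ohsumed'], ['NaiveCC', 'StackCC']))
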