From 4572ec266dce37b56d904598df1df417866e321c Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Thu, 2 Sep 2021 11:07:33 +0200 Subject: [PATCH] adding multi-label classification methods --- MultiLabel/gentables.py | 63 ++++++++---- MultiLabel/main.py | 170 ++++++++++++++++++++++----------- MultiLabel/mlclassification.py | 61 +++++++++++- MultiLabel/mldata.py | 8 +- MultiLabel/mlquantification.py | 65 ++++++------- MultiLabel/tabular.py | 36 ++++++- 6 files changed, 286 insertions(+), 117 deletions(-) diff --git a/MultiLabel/gentables.py b/MultiLabel/gentables.py index 5124289..750bb68 100644 --- a/MultiLabel/gentables.py +++ b/MultiLabel/gentables.py @@ -7,12 +7,12 @@ from tqdm import tqdm from skmultilearn.dataset import load_dataset, available_data_sets from scipy.sparse import csr_matrix import quapy as qp -from MultiLabel.main import load_results -from MultiLabel.mlclassification import MultilabelStackedClassifier +from MultiLabel.main import load_results, SKMULTILEARN_RED_DATASETS, TC_DATASETS, sample_size +from MultiLabel.mlclassification import MLStackedClassifier from MultiLabel.mldata import MultilabelledCollection -from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \ +from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \ MLACC, \ - MLPACC, MultilabelNaiveAggregativeQuantifier + MLPACC, MLNaiveAggregativeQuantifier from MultiLabel.tabular import Table from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy import numpy as np @@ -22,29 +22,56 @@ import sys import os import pickle -models = ['NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', 'NaiveHDy', 'NaiveSLD'] -datasets = sorted(set([x[0] for x in available_data_sets().keys()])) +models = [#'MLPE', + 'NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', #'NaiveHDy', 'NaiveSLD', + 'StackCC', 'StackPCC', 'StackACC', 'StackPACC', + 'MRQ-CC', 'MRQ-PCC', 'MRQ-ACC', 'MRQ-PACC', + 'MRQ-StackCC', 'MRQ-StackPCC', 'MRQ-StackACC', 'MRQ-StackPACC', + 'MRQ-StackCC-app', 'MRQ-StackPCC-app', 'MRQ-StackACC-app', 'MRQ-StackPACC-app', + 'LSP-CC', 'LSP-ACC' +] + +# datasets = sorted(set([x[0] for x in available_data_sets().keys()])) +datasets = TC_DATASETS + + def generate_table(path, protocol, error): - print(f'generating {path}') - table = Table(datasets, models) - for dataset, model in itertools.product(datasets, models): + + def compute_score_job(args): + dataset, model = args result_path = f'{opt.results}/{dataset}_{model}.pkl' if os.path.exists(result_path): + print('+', end='') + sys.stdout.flush() result = load_results(result_path) true_prevs, estim_prevs = result[protocol] scores = np.asarray([error(trues, estims) for trues, estims in zip(true_prevs, estim_prevs)]).flatten() + return dataset, model, scores + print('-', end='') + sys.stdout.flush() + return None + + + print(f'\ngenerating {path}') + table = Table(datasets, models, prec_mean=4, significance_test='wilcoxon') + results = qp.util.parallel(compute_score_job, list(itertools.product(datasets, models)), n_jobs=-1) + print() + + for r in results: + if r is not None: + dataset, model, scores = r table.add(dataset, model, scores) tabular = """ \\resizebox{\\textwidth}{!}{% \\begin{tabular}{|c||""" + ('c|' * len(models)) + """} \hline """ - dataset_replace = {'tmc2007_500': 'tmc2007\_500'} + dataset_replace = {'tmc2007_500': 'tmc2007\_500', 'tmc2007_500-red': 'tmc2007\_500-red'} method_replace = {} - tabular += table.latexTabular(benchmark_replace=dataset_replace, method_replace=method_replace) + tabular += table.latexTabularT(benchmark_replace=dataset_replace, method_replace=method_replace, side=True) tabular += """ \end{tabular}% } @@ -61,13 +88,17 @@ if __name__ == '__main__': help=f'path where to store the tables') opt = parser.parse_args() - os.makedirs(opt.results, exist_ok=True) + assert os.path.exists(opt.results), f'result directory {opt.results} does not exist' os.makedirs(opt.tablepath, exist_ok=True) - eval_error = qp.error.ae - generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=eval_error) - generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=eval_error) - + qp.environ["SAMPLE_SIZE"] = sample_size + absolute_error = qp.error.ae + relative_absolute_error = qp.error.rae + + generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=absolute_error) + generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=absolute_error) + generate_table(f'{opt.tablepath}/npp.rae.tex', protocol='npp', error=relative_absolute_error) + generate_table(f'{opt.tablepath}/app.rae.tex', protocol='app', error=relative_absolute_error) diff --git a/MultiLabel/main.py b/MultiLabel/main.py index ed3f14d..8941340 100644 --- a/MultiLabel/main.py +++ b/MultiLabel/main.py @@ -7,11 +7,11 @@ from tqdm import tqdm from skmultilearn.dataset import load_dataset, available_data_sets from scipy.sparse import csr_matrix import quapy as qp -from MultiLabel.mlclassification import MultilabelStackedClassifier +from MultiLabel.mlclassification import MLStackedClassifier, LabelSpacePartion, MLTwinSVM, MLknn from MultiLabel.mldata import MultilabelledCollection -from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \ +from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \ MLACC, \ - MLPACC, MultilabelNaiveAggregativeQuantifier + MLPACC, MLNaiveAggregativeQuantifier, MLMLPE from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy import numpy as np from data.dataset import Dataset @@ -35,80 +35,136 @@ def calibratedCls(): sample_size = 100 n_samples = 5000 +SKMULTILEARN_ALL_DATASETS = sorted(set([x[0] for x in available_data_sets().keys()])) +SKMULTILEARN_RED_DATASETS = [x+'-red' for x in SKMULTILEARN_ALL_DATASETS] +TC_DATASETS = ['reuters21578', 'jrcall', 'ohsumed', 'rcv1'] + +DATASETS = TC_DATASETS + + + + def models(): - yield 'NaiveCC', MultilabelNaiveAggregativeQuantifier(CC(cls())) - yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls())) - yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls())) - yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls())) - # yield 'NaiveHDy', MultilabelNaiveAggregativeQuantifier(HDy(cls())) - # yield 'NaiveSLD', MultilabelNaiveAggregativeQuantifier(EMQ(calibratedCls())) - yield 'StackCC', MLCC(MultilabelStackedClassifier(cls())) - yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls())) - yield 'StackACC', MLACC(MultilabelStackedClassifier(cls())) - yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls())) + yield 'MLPE', MLMLPE() + yield 'NaiveCC', MLNaiveAggregativeQuantifier(CC(cls())) + yield 'NaivePCC', MLNaiveAggregativeQuantifier(PCC(cls())) + yield 'NaiveACC', MLNaiveAggregativeQuantifier(ACC(cls())) + yield 'NaivePACC', MLNaiveAggregativeQuantifier(PACC(cls())) + # yield 'NaiveHDy', MLNaiveAggregativeQuantifier(HDy(cls())) + # yield 'NaiveSLD', MLNaiveAggregativeQuantifier(EMQ(calibratedCls())) + yield 'StackCC', MLCC(MLStackedClassifier(cls())) + yield 'StackPCC', MLPCC(MLStackedClassifier(cls())) + yield 'StackACC', MLACC(MLStackedClassifier(cls())) + yield 'StackPACC', MLPACC(MLStackedClassifier(cls())) # yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random')) # yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random')) # yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random')) # yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random')) common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'} - yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common) - yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common) - yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common) - yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common) - yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common) - yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common) - yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common) - yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common) - # yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common) - # yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common) - # yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common) - # yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), protocol='app', **common) + yield 'MRQ-CC', MLRegressionQuantification(MLNaiveQuantifier(CC(cls())), **common) + yield 'MRQ-PCC', MLRegressionQuantification(MLNaiveQuantifier(PCC(cls())), **common) + yield 'MRQ-ACC', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common) + yield 'MRQ-PACC', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common) + yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), **common) + yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), **common) + yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), **common) + yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), **common) + yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), protocol='app', **common) + yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), protocol='app', **common) + yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), protocol='app', **common) + yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), protocol='app', **common) # yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common) # yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common) # yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common) # yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common) + # yield 'LSP-CC', MLCC(LabelSpacePartion(cls())) + # yield 'LSP-ACC', MLACC(LabelSpacePartion(cls())) + # yield 'TwinSVM-CC', MLCC(MLTwinSVM()) + # yield 'TwinSVM-ACC', MLACC(MLTwinSVM()) + yield 'MLKNN-CC', MLCC(MLknn()) + yield 'MLKNN-PCC', MLPCC(MLknn()) + yield 'MLKNN-ACC', MLACC(MLknn()) + yield 'MLKNN-PACC', MLPACC(MLknn()) -# dataset = 'reuters21578' -# picklepath = '/home/moreo/word-class-embeddings/pickles' -# data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle') -# Xtr, Xte = data.vectorize() -# ytr = data.devel_labelmatrix.todense().getA() -# yte = data.test_labelmatrix.todense().getA() +def get_dataset(dataset_name, dopickle=True): + datadir = f'{qp.util.get_quapy_home()}/pickles' + datapath = f'{datadir}/{dataset_name}.pkl' + if dopickle: + if os.path.exists(datapath): + print(f'returning pickled object in {datapath}') + return pickle.load(open(datapath, 'rb')) -# remove categories with < 10 training documents -# to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50) -# ytr = ytr[:, to_keep] -# yte = yte[:, to_keep] -# print(f'num categories = {ytr.shape[1]}') + if dataset_name in SKMULTILEARN_ALL_DATASETS + SKMULTILEARN_RED_DATASETS: + clean_name = dataset_name.replace('-red','') + Xtr, ytr, feature_names, label_names = load_dataset(clean_name, 'train') + Xte, yte, _, _ = load_dataset(clean_name, 'test') + print(f'n-labels = {len(label_names)}') + Xtr = csr_matrix(Xtr) + Xte = csr_matrix(Xte) -def datasets(): - dataset_list = sorted(set([x[0] for x in available_data_sets().keys()])) - for dataset_name in dataset_list: - yield dataset_name + ytr = ytr.todense().getA() + yte = yte.todense().getA() + if dataset_name.endswith('-red'): + TO_SELECT = 10 + nC = ytr.shape[1] + tr_counts = ytr.sum(axis=0) + te_counts = yte.sum(axis=0) + if nC > TO_SELECT: + Y = ytr.T.dot(ytr) # class-class coincidence matrix + Y[np.triu_indices(nC)] = 0 # zeroing all duplicates entries and the diagonal + order_ij = np.argsort(-Y, axis=None) + selected = set() + p=0 + while len(selected) < TO_SELECT: + highest_index = order_ij[p] + class_i = highest_index // nC + class_j = highest_index % nC + # if there is only one class to go, then add the most populated one + most_populated, least_populated = (class_i, class_j) if tr_counts[class_i] > tr_counts[class_j] else (class_j, class_i) + if te_counts[most_populated]>0: + selected.add(most_populated) + if len(selected) < TO_SELECT: + if te_counts[least_populated]>0: + selected.add(least_populated) + p+=1 + selected = np.asarray(sorted(selected)) + ytr = ytr[:,selected] + yte = yte[:, selected] + # else: + # remove categories without positives in the training or test splits + # valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5) + # ytr = ytr[:, valid_categories] + # yte = yte[:, valid_categories] -def get_dataset(dataset_name): - Xtr, ytr, feature_names, label_names = load_dataset(dataset_name, 'train') - Xte, yte, _, _ = load_dataset(dataset_name, 'test') - print(f'n-labels = {len(label_names)}') + elif dataset_name in TC_DATASETS: + picklepath = '/home/moreo/word-class-embeddings/pickles' + data = Dataset.load(dataset_name, pickle_path=f'{picklepath}/{dataset_name}.pickle') + Xtr, Xte = data.vectorize() + ytr = data.devel_labelmatrix.todense().getA() + yte = data.test_labelmatrix.todense().getA() - Xtr = csr_matrix(Xtr) - Xte = csr_matrix(Xte) + # remove categories with < 50 training or test documents + # to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50) + # keep the 10 most populated categories + to_keep = np.argsort(ytr.sum(axis=0))[-10:] + ytr = ytr[:, to_keep] + yte = yte[:, to_keep] + print(f'num categories = {ytr.shape[1]}') - ytr = ytr.todense().getA() - yte = yte.todense().getA() - - # remove categories without positives in the training or test splits - valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5) - ytr = ytr[:, valid_categories] - yte = yte[:, valid_categories] + else: + raise ValueError(f'unknown dataset {dataset_name}') train = MultilabelledCollection(Xtr, ytr) test = MultilabelledCollection(Xte, yte) + if dopickle: + os.makedirs(datadir, exist_ok=True) + pickle.dump((train, test), open(datapath, 'wb'), pickle.HIGHEST_PROTOCOL) + return train, test @@ -176,8 +232,8 @@ def run_experiment(dataset_name, model_name, model): print(f'runing experiment {dataset_name} x {model_name}') train, test = get_dataset(dataset_name) - if train.n_classes>100: - return + # if train.n_classes>100: + # return print_info(train, test) @@ -186,8 +242,6 @@ def run_experiment(dataset_name, model_name, model): results_npp = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100) results_app = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=11, repeats=5) save_results(results_npp, results_app, result_path) - results_npp2, results_app2 = load_results(result_path) - print('pass') if __name__ == '__main__': @@ -198,7 +252,7 @@ if __name__ == '__main__': os.makedirs(opt.results, exist_ok=True) - for datasetname, (modelname,model) in itertools.product(datasets(), models()): + for datasetname, (modelname,model) in itertools.product(DATASETS, models()): run_experiment(datasetname, modelname, model) diff --git a/MultiLabel/mlclassification.py b/MultiLabel/mlclassification.py index 636aadb..af81a28 100644 --- a/MultiLabel/mlclassification.py +++ b/MultiLabel/mlclassification.py @@ -4,9 +4,19 @@ from sklearn.calibration import CalibratedClassifierCV from sklearn.linear_model import LogisticRegression from sklearn.multiclass import OneVsRestClassifier from sklearn.preprocessing import StandardScaler +from skmultilearn.adapt import MLTSVM + +from skmultilearn.ensemble import LabelSpacePartitioningClassifier +from skmultilearn.problem_transform import LabelPowerset +from skmultilearn.cluster import NetworkXLabelGraphClusterer, LabelCooccurrenceGraphBuilder + +from skmultilearn.embedding import SKLearnEmbedder, EmbeddingClassifier +from sklearn.manifold import SpectralEmbedding +from sklearn.ensemble import RandomForestRegressor +from skmultilearn.adapt import MLkNN -class MultilabelStackedClassifier: # aka Funnelling Monolingual +class MLStackedClassifier: # aka Funnelling Monolingual def __init__(self, base_estimator=LogisticRegression()): if not hasattr(base_estimator, 'predict_proba'): print('the estimator does not seem to be probabilistic: calibrating') @@ -31,4 +41,51 @@ class MultilabelStackedClassifier: # aka Funnelling Monolingual def predict_proba(self, X): P = self.base.predict_proba(X) P = self.norm.transform(P) - return self.meta.predict_proba(P) \ No newline at end of file + return self.meta.predict_proba(P) + + +class LabelSpacePartion: + def __init__(self, base_estimator=LogisticRegression()): + graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False) + self.classifier = LabelSpacePartitioningClassifier( + classifier=LabelPowerset(classifier=base_estimator), + clusterer=NetworkXLabelGraphClusterer(graph_builder, method='louvain') + ) + + def fit(self, X, y): + return self.classifier.fit(X, y) + + def predict(self, X): + return self.classifier.predict(X).todense().getA() + + +class MLTwinSVM: + def __init__(self): + self.classifier = MLTSVM() + + def fit(self, X, y): + return self.classifier.fit(X, y) + + def predict(self, X): + return self.classifier.predict(X).todense().getA() + + +class MLknn: + #http://scikit.ml/api/skmultilearn.embedding.classifier.html#skmultilearn.embedding.EmbeddingClassifier + #notes: need to install package openne + def __init__(self): + self.classifier = EmbeddingClassifier( + SKLearnEmbedder(SpectralEmbedding(n_components=10)), + RandomForestRegressor(n_estimators=10), + MLkNN(k=5) + ) + + def fit(self, X, y): + return self.classifier.fit(X, y) + + def predict(self, X): + return self.classifier.predict(X).todense().getA() + + def predict_proba(self, X): + return self.classifier.predict_proba(X) + diff --git a/MultiLabel/mldata.py b/MultiLabel/mldata.py index 284144f..b4b68ec 100644 --- a/MultiLabel/mldata.py +++ b/MultiLabel/mldata.py @@ -34,6 +34,10 @@ class MultilabelledCollection: def n_classes(self): return len(self.classes_) + @property + def n_features(self): + return self.instances.shape[1] + @property def binary(self): return False @@ -43,8 +47,8 @@ class MultilabelledCollection: def sampling_multi_index(self, size, cat, prev=None): if prev is None: # no prevalence was indicated; returns an index for uniform sampling - return np.random.choice(len(self), size, replace=size>len(self)) - aux = LabelledCollection(self.__gen_index(), self.labels[:,cat]) + return np.random.choice(len(self), size, replace=size > len(self)) + aux = LabelledCollection(self.__gen_index(), self.labels[:, cat]) return aux.sampling_index(size, *[1-prev, prev]) def uniform_sampling_multi_index(self, size): diff --git a/MultiLabel/mlquantification.py b/MultiLabel/mlquantification.py index 13bc719..7276267 100644 --- a/MultiLabel/mlquantification.py +++ b/MultiLabel/mlquantification.py @@ -9,7 +9,7 @@ from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, Mult ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor import quapy as qp -from MultiLabel.mlclassification import MultilabelStackedClassifier +from MultiLabel.mlclassification import MLStackedClassifier from MultiLabel.mldata import MultilabelledCollection from method.aggregative import CC, ACC, PACC, AggregativeQuantifier from method.base import BaseQuantifier @@ -25,7 +25,19 @@ class MLQuantifier: def quantify(self, instances): ... +class MLMLPE(MLQuantifier): + def fit(self, data: MultilabelledCollection): + self.tr_prev = data.prevalence() + return self + + def quantify(self, instances): + return self.tr_prev + + class MLAggregativeQuantifier(MLQuantifier): + def __init__(self, mlcls): + self.learner = mlcls + def fit(self, data:MultilabelledCollection): self.learner.fit(*data.Xy) return self @@ -42,9 +54,6 @@ class MLAggregativeQuantifier(MLQuantifier): class MLCC(MLAggregativeQuantifier): - def __init__(self, mlcls): - self.learner = mlcls - def preclassify(self, instances): return self.learner.predict(instances) @@ -55,16 +64,11 @@ class MLCC(MLAggregativeQuantifier): class MLPCC(MLCC): - def __init__(self, mlcls): - self.learner = mlcls - def preclassify(self, instances): return self.learner.predict_proba(instances) class MLACC(MLCC): - def __init__(self, mlcls): - self.learner = mlcls def fit(self, data:MultilabelledCollection, train_prop=0.6): self.classes_ = data.classes_ @@ -88,8 +92,6 @@ class MLACC(MLCC): class MLPACC(MLPCC): - def __init__(self, mlcls): - self.learner = mlcls def fit(self, data:MultilabelledCollection, train_prop=0.6): self.classes_ = data.classes_ @@ -109,7 +111,7 @@ class MLPACC(MLPCC): return pacc_prevs -class MultilabelNaiveQuantifier(MLQuantifier): +class MLNaiveQuantifier(MLQuantifier): def __init__(self, q:BaseQuantifier, n_jobs=-1): self.q = q self.estimators = None @@ -132,7 +134,7 @@ class MultilabelNaiveQuantifier(MLQuantifier): return np.asarray([neg_prevs, pos_prevs]).T -class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregativeQuantifier): +class MLNaiveAggregativeQuantifier(MLNaiveQuantifier, MLAggregativeQuantifier): def __init__(self, q:AggregativeQuantifier, n_jobs=-1): assert isinstance(q, AggregativeQuantifier), 'the quantifier is not of type aggregative!' self.q = q @@ -156,7 +158,7 @@ class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregat class MLRegressionQuantification: def __init__(self, - mlquantifier=MultilabelNaiveQuantifier(CC(LinearSVC())), + mlquantifier=MLNaiveQuantifier(CC(LinearSVC())), regression='ridge', protocol='npp', n_samples=500, @@ -201,36 +203,31 @@ class MLRegressionQuantification: return Xs, ys + def _extract_features(self, sample, Xs, ys, samples_mean, samples_std): + ys.append(sample.prevalence()[:, 1]) + Xs.append(self.estimator.quantify(sample.instances)[:, 1]) + if self.means: + samples_mean.append(sample.instances.mean(axis=0).getA().flatten()) + if self.stds: + samples_std.append(sample.instances.todense().std(axis=0).getA().flatten()) + def generate_samples_npp(self, val): - samples_mean = [] - samples_std = [] - Xs = [] - ys = [] + Xs, ys = [], [] + samples_mean, samples_std = [], [] for sample in val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples): - ys.append(sample.prevalence()[:, 1]) - Xs.append(self.estimator.quantify(sample.instances)[:, 1]) - if self.means: - samples_mean.append(sample.instances.mean(axis=0).getA().flatten()) - if self.stds: - samples_std.append(sample.instances.todense().std(axis=0).getA().flatten()) + self._extract_features(self, sample, Xs, ys, samples_mean, samples_std) return self._prepare_arrays(Xs, ys, samples_mean, samples_std) + def generate_samples_app(self, val): - samples_mean = [] - samples_std = [] - Xs = [] - ys = [] + Xs, ys = [], [] + samples_mean, samples_std = [], [] ncats = len(self.classes_) nprevs = 21 repeats = max(self.n_samples // (ncats * nprevs), 1) for cat in self.classes_: for sample in val.artificial_sampling_generator(sample_size=self.sample_size, category=cat, n_prevalences=nprevs, repeats=repeats): - ys.append(sample.prevalence()[:, 1]) - Xs.append(self.estimator.quantify(sample.instances)[:, 1]) - if self.means: - samples_mean.append(sample.instances.mean(axis=0).getA().flatten()) - if self.stds: - samples_std.append(sample.instances.todense().std(axis=0).getA().flatten()) + self._extract_features(self, sample, Xs, ys, samples_mean, samples_std) return self._prepare_arrays(Xs, ys, samples_mean, samples_std) def fit(self, data:MultilabelledCollection): diff --git a/MultiLabel/tabular.py b/MultiLabel/tabular.py index 44f519d..b8a6b42 100644 --- a/MultiLabel/tabular.py +++ b/MultiLabel/tabular.py @@ -6,10 +6,10 @@ from scipy.stats import ttest_ind_from_stats, wilcoxon class Table: VALID_TESTS = [None, "wilcoxon", "ttest"] - def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3, + def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='ttest', prec_mean=3, clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--', color=True): - assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}' + assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}' self.benchmarks = np.asarray(benchmarks) self.benchmark_index = {row: i for i, row in enumerate(benchmarks)} @@ -21,7 +21,7 @@ class Table: # keyed (#rows,#cols)-ndarrays holding computations from self.map['values'] self._addmap('values', dtype=object) self.lower_is_better = lower_is_better - self.ttest = ttest + self.ttest = significance_test self.prec_mean = prec_mean self.clean_zero = clean_zero self.show_std = show_std @@ -156,8 +156,9 @@ class Table: return all(self.map['fill'][:, self.method_index[col]]) def _addave(self): - ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, ttest=self.ttest, average=False, - missing=self.missing, missing_str=self.missing_str) + ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False, + missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std, + show_std=self.show_std) for col in self.methods: values = None if self._is_column_full(col): @@ -267,12 +268,37 @@ class Table: tab += self.latexAverage() return tab + def latexTabularT(self, benchmark_replace={}, method_replace={}, average=True, side=False): + def withside(label): + return '\side{'+label+'}' if side else label + + tab = ' & ' + tab += ' & '.join([withside(benchmark_replace.get(col, col)) for col in self.benchmarks]) + if average: + tab += ' & ' + withside('Ave') + tab += ' \\\\\hline\n' + for row in self.methods: + rowname = method_replace.get(row, row) + tab += rowname + ' & ' + tab += self.latexRowT(row, endl='') + if average: + tab += ' & ' + tab += self.average.latexCell('ave', row) + tab += '\\\\\hline\n' + return tab + def latexRow(self, benchmark, endl='\\\\\hline\n'): s = [self.latexCell(benchmark, col) for col in self.methods] s = ' & '.join(s) s += ' ' + endl return s + def latexRowT(self, method, endl='\\\\\hline\n'): + s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks] + s = ' & '.join(s) + s += ' ' + endl + return s + def latexAverage(self, endl='\\\\\hline\n'): if self.add_average: return self.average.latexRow('ave', endl=endl)