
adding multi-label classification methods

Alejandro Moreo Fernandez 2021-09-02 11:07:33 +02:00
parent dc2fa05cf8
commit 4572ec266d
6 changed files with 286 additions and 117 deletions

View File

@@ -7,12 +7,12 @@ from tqdm import tqdm
from skmultilearn.dataset import load_dataset, available_data_sets
from scipy.sparse import csr_matrix
import quapy as qp
from MultiLabel.main import load_results
from MultiLabel.mlclassification import MultilabelStackedClassifier
from MultiLabel.main import load_results, SKMULTILEARN_RED_DATASETS, TC_DATASETS, sample_size
from MultiLabel.mlclassification import MLStackedClassifier
from MultiLabel.mldata import MultilabelledCollection
from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
MLACC, \
MLPACC, MultilabelNaiveAggregativeQuantifier
MLPACC, MLNaiveAggregativeQuantifier
from MultiLabel.tabular import Table
from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
import numpy as np
@@ -22,29 +22,56 @@ import sys
import os
import pickle
models = ['NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', 'NaiveHDy', 'NaiveSLD']
datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
models = [#'MLPE',
'NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', #'NaiveHDy', 'NaiveSLD',
'StackCC', 'StackPCC', 'StackACC', 'StackPACC',
'MRQ-CC', 'MRQ-PCC', 'MRQ-ACC', 'MRQ-PACC',
'MRQ-StackCC', 'MRQ-StackPCC', 'MRQ-StackACC', 'MRQ-StackPACC',
'MRQ-StackCC-app', 'MRQ-StackPCC-app', 'MRQ-StackACC-app', 'MRQ-StackPACC-app',
'LSP-CC', 'LSP-ACC'
]
# datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
datasets = TC_DATASETS
def generate_table(path, protocol, error):
print(f'generating {path}')
table = Table(datasets, models)
for dataset, model in itertools.product(datasets, models):
def compute_score_job(args):
dataset, model = args
result_path = f'{opt.results}/{dataset}_{model}.pkl'
if os.path.exists(result_path):
print('+', end='')
sys.stdout.flush()
result = load_results(result_path)
true_prevs, estim_prevs = result[protocol]
scores = np.asarray([error(trues, estims) for trues, estims in zip(true_prevs, estim_prevs)]).flatten()
return dataset, model, scores
print('-', end='')
sys.stdout.flush()
return None
print(f'\ngenerating {path}')
table = Table(datasets, models, prec_mean=4, significance_test='wilcoxon')
results = qp.util.parallel(compute_score_job, list(itertools.product(datasets, models)), n_jobs=-1)
print()
for r in results:
if r is not None:
dataset, model, scores = r
table.add(dataset, model, scores)
tabular = """
\\resizebox{\\textwidth}{!}{%
\\begin{tabular}{|c||""" + ('c|' * len(models)) + """} \hline
"""
dataset_replace = {'tmc2007_500': 'tmc2007\_500'}
dataset_replace = {'tmc2007_500': 'tmc2007\_500', 'tmc2007_500-red': 'tmc2007\_500-red'}
method_replace = {}
tabular += table.latexTabular(benchmark_replace=dataset_replace, method_replace=method_replace)
tabular += table.latexTabularT(benchmark_replace=dataset_replace, method_replace=method_replace, side=True)
tabular += """
\end{tabular}%
}
@@ -61,13 +88,17 @@ if __name__ == '__main__':
help=f'path where to store the tables')
opt = parser.parse_args()
os.makedirs(opt.results, exist_ok=True)
assert os.path.exists(opt.results), f'result directory {opt.results} does not exist'
os.makedirs(opt.tablepath, exist_ok=True)
eval_error = qp.error.ae
generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=eval_error)
generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=eval_error)
qp.environ["SAMPLE_SIZE"] = sample_size
absolute_error = qp.error.ae
relative_absolute_error = qp.error.rae
generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=absolute_error)
generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=absolute_error)
generate_table(f'{opt.tablepath}/npp.rae.tex', protocol='npp', error=relative_absolute_error)
generate_table(f'{opt.tablepath}/app.rae.tex', protocol='app', error=relative_absolute_error)
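The table cells above are filled from per-sample errors between true and estimated prevalence vectors. A minimal standalone sketch of that scoring step (toy prevalence vectors, not the pickled results loaded by compute_score_job), assuming qp.error.ae is the mean absolute difference across classes:

import numpy as np

def absolute_error(true_prev, estim_prev):
    # mean absolute difference between a true and an estimated prevalence vector
    return np.abs(true_prev - estim_prev).mean()

# toy stand-ins for the (true_prevs, estim_prevs) pair stored under each protocol
true_prevs  = [np.array([0.7, 0.3]), np.array([0.5, 0.5])]
estim_prevs = [np.array([0.6, 0.4]), np.array([0.45, 0.55])]

scores = np.asarray([absolute_error(t, e) for t, e in zip(true_prevs, estim_prevs)])
print(scores)         # one score per test sample, as passed to table.add(...)
print(scores.mean())  # the per-cell average the Table reports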

View File

@@ -7,11 +7,11 @@ from tqdm import tqdm
from skmultilearn.dataset import load_dataset, available_data_sets
from scipy.sparse import csr_matrix
import quapy as qp
from MultiLabel.mlclassification import MultilabelStackedClassifier
from MultiLabel.mlclassification import MLStackedClassifier, LabelSpacePartion, MLTwinSVM, MLknn
from MultiLabel.mldata import MultilabelledCollection
from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
MLACC, \
MLPACC, MultilabelNaiveAggregativeQuantifier
MLPACC, MLNaiveAggregativeQuantifier, MLMLPE
from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
import numpy as np
from data.dataset import Dataset
@@ -35,64 +35,71 @@ def calibratedCls():
sample_size = 100
n_samples = 5000
SKMULTILEARN_ALL_DATASETS = sorted(set([x[0] for x in available_data_sets().keys()]))
SKMULTILEARN_RED_DATASETS = [x+'-red' for x in SKMULTILEARN_ALL_DATASETS]
TC_DATASETS = ['reuters21578', 'jrcall', 'ohsumed', 'rcv1']
DATASETS = TC_DATASETS
def models():
yield 'NaiveCC', MultilabelNaiveAggregativeQuantifier(CC(cls()))
yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
# yield 'NaiveHDy', MultilabelNaiveAggregativeQuantifier(HDy(cls()))
# yield 'NaiveSLD', MultilabelNaiveAggregativeQuantifier(EMQ(calibratedCls()))
yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
yield 'StackACC', MLACC(MultilabelStackedClassifier(cls()))
yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls()))
yield 'MLPE', MLMLPE()
yield 'NaiveCC', MLNaiveAggregativeQuantifier(CC(cls()))
yield 'NaivePCC', MLNaiveAggregativeQuantifier(PCC(cls()))
yield 'NaiveACC', MLNaiveAggregativeQuantifier(ACC(cls()))
yield 'NaivePACC', MLNaiveAggregativeQuantifier(PACC(cls()))
# yield 'NaiveHDy', MLNaiveAggregativeQuantifier(HDy(cls()))
# yield 'NaiveSLD', MLNaiveAggregativeQuantifier(EMQ(calibratedCls()))
yield 'StackCC', MLCC(MLStackedClassifier(cls()))
yield 'StackPCC', MLPCC(MLStackedClassifier(cls()))
yield 'StackACC', MLACC(MLStackedClassifier(cls()))
yield 'StackPACC', MLPACC(MLStackedClassifier(cls()))
# yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random'))
# yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random'))
# yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random'))
# yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random'))
common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common)
yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common)
yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common)
yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common)
# yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
# yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
# yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
# yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
yield 'MRQ-CC', MLRegressionQuantification(MLNaiveQuantifier(CC(cls())), **common)
yield 'MRQ-PCC', MLRegressionQuantification(MLNaiveQuantifier(PCC(cls())), **common)
yield 'MRQ-ACC', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common)
yield 'MRQ-PACC', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common)
yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), **common)
yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), **common)
yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), **common)
yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), **common)
yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), protocol='app', **common)
# yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common)
# yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common)
# yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common)
# yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common)
# yield 'LSP-CC', MLCC(LabelSpacePartion(cls()))
# yield 'LSP-ACC', MLACC(LabelSpacePartion(cls()))
# yield 'TwinSVM-CC', MLCC(MLTwinSVM())
# yield 'TwinSVM-ACC', MLACC(MLTwinSVM())
yield 'MLKNN-CC', MLCC(MLknn())
yield 'MLKNN-PCC', MLPCC(MLknn())
yield 'MLKNN-ACC', MLACC(MLknn())
yield 'MLKNN-PACC', MLPACC(MLknn())
# dataset = 'reuters21578'
# picklepath = '/home/moreo/word-class-embeddings/pickles'
# data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle')
# Xtr, Xte = data.vectorize()
# ytr = data.devel_labelmatrix.todense().getA()
# yte = data.test_labelmatrix.todense().getA()
def get_dataset(dataset_name, dopickle=True):
datadir = f'{qp.util.get_quapy_home()}/pickles'
datapath = f'{datadir}/{dataset_name}.pkl'
if dopickle:
if os.path.exists(datapath):
print(f'returning pickled object in {datapath}')
return pickle.load(open(datapath, 'rb'))
# remove categories with < 10 training documents
# to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
# ytr = ytr[:, to_keep]
# yte = yte[:, to_keep]
# print(f'num categories = {ytr.shape[1]}')
def datasets():
dataset_list = sorted(set([x[0] for x in available_data_sets().keys()]))
for dataset_name in dataset_list:
yield dataset_name
def get_dataset(dataset_name):
Xtr, ytr, feature_names, label_names = load_dataset(dataset_name, 'train')
Xte, yte, _, _ = load_dataset(dataset_name, 'test')
if dataset_name in SKMULTILEARN_ALL_DATASETS + SKMULTILEARN_RED_DATASETS:
clean_name = dataset_name.replace('-red','')
Xtr, ytr, feature_names, label_names = load_dataset(clean_name, 'train')
Xte, yte, _, _ = load_dataset(clean_name, 'test')
print(f'n-labels = {len(label_names)}')
Xtr = csr_matrix(Xtr)
@@ -101,14 +108,63 @@ def get_dataset(dataset_name):
ytr = ytr.todense().getA()
yte = yte.todense().getA()
if dataset_name.endswith('-red'):
TO_SELECT = 10
nC = ytr.shape[1]
tr_counts = ytr.sum(axis=0)
te_counts = yte.sum(axis=0)
if nC > TO_SELECT:
Y = ytr.T.dot(ytr) # class-class coincidence matrix
Y[np.triu_indices(nC)] = 0 # zeroing all duplicates entries and the diagonal
order_ij = np.argsort(-Y, axis=None)
selected = set()
p=0
while len(selected) < TO_SELECT:
highest_index = order_ij[p]
class_i = highest_index // nC
class_j = highest_index % nC
# if there is only one class to go, then add the most populated one
most_populated, least_populated = (class_i, class_j) if tr_counts[class_i] > tr_counts[class_j] else (class_j, class_i)
if te_counts[most_populated]>0:
selected.add(most_populated)
if len(selected) < TO_SELECT:
if te_counts[least_populated]>0:
selected.add(least_populated)
p+=1
selected = np.asarray(sorted(selected))
ytr = ytr[:,selected]
yte = yte[:, selected]
# else:
# remove categories without positives in the training or test splits
valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
ytr = ytr[:, valid_categories]
yte = yte[:, valid_categories]
# valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
# ytr = ytr[:, valid_categories]
# yte = yte[:, valid_categories]
elif dataset_name in TC_DATASETS:
picklepath = '/home/moreo/word-class-embeddings/pickles'
data = Dataset.load(dataset_name, pickle_path=f'{picklepath}/{dataset_name}.pickle')
Xtr, Xte = data.vectorize()
ytr = data.devel_labelmatrix.todense().getA()
yte = data.test_labelmatrix.todense().getA()
# remove categories with < 50 training or test documents
# to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
# keep the 10 most populated categories
to_keep = np.argsort(ytr.sum(axis=0))[-10:]
ytr = ytr[:, to_keep]
yte = yte[:, to_keep]
print(f'num categories = {ytr.shape[1]}')
else:
raise ValueError(f'unknown dataset {dataset_name}')
train = MultilabelledCollection(Xtr, ytr)
test = MultilabelledCollection(Xte, yte)
if dopickle:
os.makedirs(datadir, exist_ok=True)
pickle.dump((train, test), open(datapath, 'wb'), pickle.HIGHEST_PROTOCOL)
return train, test
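The '-red' branch above keeps only the TO_SELECT categories most involved in frequently co-occurring label pairs. A standalone toy sketch of that selection (without the train/test count checks used above), showing how the flattened argsort index is decoded back into a class pair:

import numpy as np

rng = np.random.default_rng(0)
ytr = (rng.random((200, 6)) < 0.3).astype(int)   # toy binary label matrix, 6 categories
TO_SELECT = 4
nC = ytr.shape[1]

Y = ytr.T.dot(ytr)                     # class-class co-occurrence counts
Y[np.triu_indices(nC)] = 0             # keep each unordered pair once; drop the diagonal
order_ij = np.argsort(-Y, axis=None)   # flattened indices, most co-occurring pair first

selected, p = set(), 0
while len(selected) < TO_SELECT:
    idx = order_ij[p]
    class_i, class_j = idx // nC, idx % nC   # decode the flattened index into (row, col)
    selected.add(class_i)
    if len(selected) < TO_SELECT:
        selected.add(class_j)
    p += 1

selected = np.asarray(sorted(selected))
print(selected)                        # column indices retained in ytr[:, selected]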
@@ -176,8 +232,8 @@ def run_experiment(dataset_name, model_name, model):
print(f'running experiment {dataset_name} x {model_name}')
train, test = get_dataset(dataset_name)
if train.n_classes>100:
return
# if train.n_classes>100:
# return
print_info(train, test)
@@ -186,8 +242,6 @@ def run_experiment(dataset_name, model_name, model):
results_npp = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
results_app = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=11, repeats=5)
save_results(results_npp, results_app, result_path)
results_npp2, results_app2 = load_results(result_path)
print('pass')
if __name__ == '__main__':
@@ -198,7 +252,7 @@ if __name__ == '__main__':
os.makedirs(opt.results, exist_ok=True)
for datasetname, (modelname,model) in itertools.product(datasets(), models()):
for datasetname, (modelname,model) in itertools.product(DATASETS, models()):
run_experiment(datasetname, modelname, model)
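run_experiment evaluates every model under two protocols: NPP (samples drawn at the test set's natural prevalence) and APP (samples re-drawn so that one category's prevalence sweeps a grid). A conceptual toy sketch of the difference, not the ml_natural_prevalence_prediction / ml_artificial_prevalence_prediction helpers themselves:

import numpy as np

rng = np.random.default_rng(0)
y = (rng.random((2000, 4)) < 0.2).astype(int)   # toy multi-label test set, 4 categories
sample_size = 100

# NPP: a random subset keeps roughly the natural prevalence of the test set
npp_idx = rng.choice(len(y), size=sample_size, replace=False)
print('npp prevalences:', y[npp_idx].mean(axis=0))

# APP: force one category (here cat 0) to a target prevalence, e.g. 0.7
target, cat = 0.7, 0
pos = np.flatnonzero(y[:, cat] == 1)
neg = np.flatnonzero(y[:, cat] == 0)
n_pos = int(round(target * sample_size))
app_idx = np.concatenate([rng.choice(pos, n_pos, replace=True),
                          rng.choice(neg, sample_size - n_pos, replace=True)])
print('app prevalence of cat 0:', y[app_idx, cat].mean())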

View File

@@ -4,9 +4,19 @@ from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from skmultilearn.adapt import MLTSVM
from skmultilearn.ensemble import LabelSpacePartitioningClassifier
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.cluster import NetworkXLabelGraphClusterer, LabelCooccurrenceGraphBuilder
from skmultilearn.embedding import SKLearnEmbedder, EmbeddingClassifier
from sklearn.manifold import SpectralEmbedding
from sklearn.ensemble import RandomForestRegressor
from skmultilearn.adapt import MLkNN
class MultilabelStackedClassifier: # aka Funnelling Monolingual
class MLStackedClassifier: # aka Funnelling Monolingual
def __init__(self, base_estimator=LogisticRegression()):
if not hasattr(base_estimator, 'predict_proba'):
print('the estimator does not seem to be probabilistic: calibrating')
@@ -32,3 +42,50 @@ class MultilabelStackedClassifier: # aka Funnelling Monolingual
P = self.base.predict_proba(X)
P = self.norm.transform(P)
return self.meta.predict_proba(P)
class LabelSpacePartion:
def __init__(self, base_estimator=LogisticRegression()):
graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
self.classifier = LabelSpacePartitioningClassifier(
classifier=LabelPowerset(classifier=base_estimator),
clusterer=NetworkXLabelGraphClusterer(graph_builder, method='louvain')
)
def fit(self, X, y):
return self.classifier.fit(X, y)
def predict(self, X):
return self.classifier.predict(X).todense().getA()
class MLTwinSVM:
def __init__(self):
self.classifier = MLTSVM()
def fit(self, X, y):
return self.classifier.fit(X, y)
def predict(self, X):
return self.classifier.predict(X).todense().getA()
class MLknn:
#http://scikit.ml/api/skmultilearn.embedding.classifier.html#skmultilearn.embedding.EmbeddingClassifier
#notes: need to install package openne
def __init__(self):
self.classifier = EmbeddingClassifier(
SKLearnEmbedder(SpectralEmbedding(n_components=10)),
RandomForestRegressor(n_estimators=10),
MLkNN(k=5)
)
def fit(self, X, y):
return self.classifier.fit(X, y)
def predict(self, X):
return self.classifier.predict(X).todense().getA()
def predict_proba(self, X):
return self.classifier.predict_proba(X)
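For reference, a rough self-contained sketch of the stacked ("funnelling") scheme that MLStackedClassifier appears to implement, judging from the fit/predict_proba methods above: a one-vs-rest base learner maps documents to per-class posteriors, which are standardized and fed to a one-vs-rest meta learner. Class and variable names here are illustrative only:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler

class TinyStackedClassifier:
    def __init__(self):
        self.base = OneVsRestClassifier(LogisticRegression(max_iter=1000))
        self.norm = StandardScaler()
        self.meta = OneVsRestClassifier(LogisticRegression(max_iter=1000))

    def fit(self, X, y):
        self.base.fit(X, y)
        P = self.norm.fit_transform(self.base.predict_proba(X))   # posteriors become features
        self.meta.fit(P, y)
        return self

    def predict_proba(self, X):
        P = self.norm.transform(self.base.predict_proba(X))
        return self.meta.predict_proba(P)

# toy usage on random multi-label data
rng = np.random.default_rng(0)
X = rng.random((100, 20))
y = (rng.random((100, 3)) < 0.4).astype(int)
print(TinyStackedClassifier().fit(X, y).predict_proba(X[:3]))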

View File

@@ -34,6 +34,10 @@ class MultilabelledCollection:
def n_classes(self):
return len(self.classes_)
@property
def n_features(self):
return self.instances.shape[1]
@property
def binary(self):
return False

View File

@@ -9,7 +9,7 @@ from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, Mult
ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor
import quapy as qp
from MultiLabel.mlclassification import MultilabelStackedClassifier
from MultiLabel.mlclassification import MLStackedClassifier
from MultiLabel.mldata import MultilabelledCollection
from method.aggregative import CC, ACC, PACC, AggregativeQuantifier
from method.base import BaseQuantifier
@@ -25,7 +25,19 @@ class MLQuantifier:
def quantify(self, instances): ...
class MLMLPE(MLQuantifier):
def fit(self, data: MultilabelledCollection):
self.tr_prev = data.prevalence()
return self
def quantify(self, instances):
return self.tr_prev
class MLAggregativeQuantifier(MLQuantifier):
def __init__(self, mlcls):
self.learner = mlcls
def fit(self, data:MultilabelledCollection):
self.learner.fit(*data.Xy)
return self
@@ -42,9 +54,6 @@ class MLAggregativeQuantifier(MLQuantifier):
class MLCC(MLAggregativeQuantifier):
def __init__(self, mlcls):
self.learner = mlcls
def preclassify(self, instances):
return self.learner.predict(instances)
@@ -55,16 +64,11 @@ class MLCC(MLAggregativeQuantifier):
class MLPCC(MLCC):
def __init__(self, mlcls):
self.learner = mlcls
def preclassify(self, instances):
return self.learner.predict_proba(instances)
class MLACC(MLCC):
def __init__(self, mlcls):
self.learner = mlcls
def fit(self, data:MultilabelledCollection, train_prop=0.6):
self.classes_ = data.classes_
@@ -88,8 +92,6 @@ class MLACC(MLCC):
class MLPACC(MLPCC):
def __init__(self, mlcls):
self.learner = mlcls
def fit(self, data:MultilabelledCollection, train_prop=0.6):
self.classes_ = data.classes_
@@ -109,7 +111,7 @@ class MLPACC(MLPCC):
return pacc_prevs
class MultilabelNaiveQuantifier(MLQuantifier):
class MLNaiveQuantifier(MLQuantifier):
def __init__(self, q:BaseQuantifier, n_jobs=-1):
self.q = q
self.estimators = None
@@ -132,7 +134,7 @@ class MultilabelNaiveQuantifier(MLQuantifier):
return np.asarray([neg_prevs, pos_prevs]).T
class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregativeQuantifier):
class MLNaiveAggregativeQuantifier(MLNaiveQuantifier, MLAggregativeQuantifier):
def __init__(self, q:AggregativeQuantifier, n_jobs=-1):
assert isinstance(q, AggregativeQuantifier), 'the quantifier is not of type aggregative!'
self.q = q
@@ -156,7 +158,7 @@ class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregat
class MLRegressionQuantification:
def __init__(self,
mlquantifier=MultilabelNaiveQuantifier(CC(LinearSVC())),
mlquantifier=MLNaiveQuantifier(CC(LinearSVC())),
regression='ridge',
protocol='npp',
n_samples=500,
@@ -201,36 +203,31 @@ class MLRegressionQuantification:
return Xs, ys
def generate_samples_npp(self, val):
samples_mean = []
samples_std = []
Xs = []
ys = []
for sample in val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
def _extract_features(self, sample, Xs, ys, samples_mean, samples_std):
ys.append(sample.prevalence()[:, 1])
Xs.append(self.estimator.quantify(sample.instances)[:, 1])
if self.means:
samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
if self.stds:
samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
def generate_samples_npp(self, val):
Xs, ys = [], []
samples_mean, samples_std = [], []
for sample in val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
self._extract_features(sample, Xs, ys, samples_mean, samples_std)
return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
def generate_samples_app(self, val):
samples_mean = []
samples_std = []
Xs = []
ys = []
Xs, ys = [], []
samples_mean, samples_std = [], []
ncats = len(self.classes_)
nprevs = 21
repeats = max(self.n_samples // (ncats * nprevs), 1)
for cat in self.classes_:
for sample in val.artificial_sampling_generator(sample_size=self.sample_size, category=cat, n_prevalences=nprevs, repeats=repeats):
ys.append(sample.prevalence()[:, 1])
Xs.append(self.estimator.quantify(sample.instances)[:, 1])
if self.means:
samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
if self.stds:
samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
self._extract_features(sample, Xs, ys, samples_mean, samples_std)
return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
def fit(self, data:MultilabelledCollection):
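The regression quantifier above learns a correction from the base quantifier's estimates on many validation samples to their true prevalences. A minimal illustration of that idea with synthetic prevalence vectors and a plain Ridge regressor (the class itself supports several regressors and optional mean/std features):

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
n_samples, n_classes = 500, 5

# y: true per-label positive prevalences of the validation samples
true_prevs = rng.uniform(0.05, 0.95, size=(n_samples, n_classes))
# X: what a (systematically biased) base quantifier would estimate on those samples
estim_prevs = np.clip(0.8 * true_prevs + 0.05 + rng.normal(0, 0.03, true_prevs.shape), 0, 1)

reg = Ridge().fit(estim_prevs, true_prevs)       # multi-output regression

# at test time, the base quantifier's output is corrected by the learned mapping
raw_estimate = np.array([[0.30, 0.20, 0.20, 0.15, 0.15]])
print(np.clip(reg.predict(raw_estimate), 0, 1))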

View File

@@ -6,10 +6,10 @@ from scipy.stats import ttest_ind_from_stats, wilcoxon
class Table:
VALID_TESTS = [None, "wilcoxon", "ttest"]
def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='ttest', prec_mean=3,
clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
color=True):
assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
self.benchmarks = np.asarray(benchmarks)
self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}
@@ -21,7 +21,7 @@ class Table:
# keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
self._addmap('values', dtype=object)
self.lower_is_better = lower_is_better
self.ttest = ttest
self.ttest = significance_test
self.prec_mean = prec_mean
self.clean_zero = clean_zero
self.show_std = show_std
@@ -156,8 +156,9 @@ class Table:
return all(self.map['fill'][:, self.method_index[col]])
def _addave(self):
ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, ttest=self.ttest, average=False,
missing=self.missing, missing_str=self.missing_str)
ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
show_std=self.show_std)
for col in self.methods:
values = None
if self._is_column_full(col):
@@ -267,12 +268,37 @@ class Table:
tab += self.latexAverage()
return tab
def latexTabularT(self, benchmark_replace={}, method_replace={}, average=True, side=False):
def withside(label):
return '\side{'+label+'}' if side else label
tab = ' & '
tab += ' & '.join([withside(benchmark_replace.get(col, col)) for col in self.benchmarks])
if average:
tab += ' & ' + withside('Ave')
tab += ' \\\\\hline\n'
for row in self.methods:
rowname = method_replace.get(row, row)
tab += rowname + ' & '
tab += self.latexRowT(row, endl='')
if average:
tab += ' & '
tab += self.average.latexCell('ave', row)
tab += '\\\\\hline\n'
return tab
def latexRow(self, benchmark, endl='\\\\\hline\n'):
s = [self.latexCell(benchmark, col) for col in self.methods]
s = ' & '.join(s)
s += ' ' + endl
return s
def latexRowT(self, method, endl='\\\\\hline\n'):
s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
s = ' & '.join(s)
s += ' ' + endl
return s
def latexAverage(self, endl='\\\\\hline\n'):
if self.add_average:
return self.average.latexRow('ave', endl=endl)
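latexTabularT emits the transposed layout: benchmarks as columns, methods as rows, with an optional rotated ('\side') header and a trailing average column. A toy sketch of the row/column orientation it produces (made-up cell values, no significance marks or averages):

benchmarks = ['reuters21578', 'rcv1']
methods = ['NaiveCC', 'StackCC']
cells = {('reuters21578', 'NaiveCC'): '0.0412', ('reuters21578', 'StackCC'): '0.0390',
         ('rcv1', 'NaiveCC'): '0.0587', ('rcv1', 'StackCC'): '0.0551'}

lines = [' & ' + ' & '.join(benchmarks) + r' \\\hline']   # header: one column per benchmark
for m in methods:                                         # one row per method
    lines.append(m + ' & ' + ' & '.join(cells[(b, m)] for b in benchmarks) + r' \\\hline')
print('\n'.join(lines))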