forked from moreo/QuaPy

adding multi-label classification methods

This commit is contained in:
parent dc2fa05cf8
commit 4572ec266d
@@ -7,12 +7,12 @@ from tqdm import tqdm
 from skmultilearn.dataset import load_dataset, available_data_sets
 from scipy.sparse import csr_matrix
 import quapy as qp
-from MultiLabel.main import load_results
-from MultiLabel.mlclassification import MultilabelStackedClassifier
+from MultiLabel.main import load_results, SKMULTILEARN_RED_DATASETS, TC_DATASETS, sample_size
+from MultiLabel.mlclassification import MLStackedClassifier
 from MultiLabel.mldata import MultilabelledCollection
-from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
+from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
     MLACC, \
-    MLPACC, MultilabelNaiveAggregativeQuantifier
+    MLPACC, MLNaiveAggregativeQuantifier
 from MultiLabel.tabular import Table
 from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
 import numpy as np
@@ -22,29 +22,56 @@ import sys
 import os
 import pickle
 
-models = ['NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', 'NaiveHDy', 'NaiveSLD']
-datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
+models = [#'MLPE',
+    'NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', #'NaiveHDy', 'NaiveSLD',
+    'StackCC', 'StackPCC', 'StackACC', 'StackPACC',
+    'MRQ-CC', 'MRQ-PCC', 'MRQ-ACC', 'MRQ-PACC',
+    'MRQ-StackCC', 'MRQ-StackPCC', 'MRQ-StackACC', 'MRQ-StackPACC',
+    'MRQ-StackCC-app', 'MRQ-StackPCC-app', 'MRQ-StackACC-app', 'MRQ-StackPACC-app',
+    'LSP-CC', 'LSP-ACC'
+]
+
+# datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
+datasets = TC_DATASETS
 
 
 def generate_table(path, protocol, error):
-    print(f'generating {path}')
-    table = Table(datasets, models)
-    for dataset, model in itertools.product(datasets, models):
+
+    def compute_score_job(args):
+        dataset, model = args
         result_path = f'{opt.results}/{dataset}_{model}.pkl'
         if os.path.exists(result_path):
             print('+', end='')
             sys.stdout.flush()
             result = load_results(result_path)
             true_prevs, estim_prevs = result[protocol]
             scores = np.asarray([error(trues, estims) for trues, estims in zip(true_prevs, estim_prevs)]).flatten()
+            return dataset, model, scores
+        print('-', end='')
+        sys.stdout.flush()
+        return None
+
+    print(f'\ngenerating {path}')
+    table = Table(datasets, models, prec_mean=4, significance_test='wilcoxon')
+    results = qp.util.parallel(compute_score_job, list(itertools.product(datasets, models)), n_jobs=-1)
+    print()
+
+    for r in results:
+        if r is not None:
+            dataset, model, scores = r
             table.add(dataset, model, scores)
 
     tabular = """
     \\resizebox{\\textwidth}{!}{%
     \\begin{tabular}{|c||""" + ('c|' * len(models)) + """} \hline
     """
-    dataset_replace = {'tmc2007_500': 'tmc2007\_500'}
+    dataset_replace = {'tmc2007_500': 'tmc2007\_500', 'tmc2007_500-red': 'tmc2007\_500-red'}
     method_replace = {}
 
-    tabular += table.latexTabular(benchmark_replace=dataset_replace, method_replace=method_replace)
+    tabular += table.latexTabularT(benchmark_replace=dataset_replace, method_replace=method_replace, side=True)
     tabular += """
     \end{tabular}%
     }
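
The refactoring above extracts the per-(dataset, model) scoring into compute_score_job so that qp.util.parallel can dispatch the jobs concurrently. A rough standard-library equivalent of that dispatch pattern, with a hypothetical placeholder worker rather than the commit's actual scoring code:

    from multiprocessing import Pool
    import itertools

    def compute_score_job(args):
        # placeholder worker: unpack one (dataset, model) pair and return a score
        dataset, model = args
        return dataset, model, 0.0  # a real worker would load saved results and score them

    if __name__ == '__main__':
        jobs = list(itertools.product(['reuters21578', 'ohsumed'], ['NaiveCC', 'StackCC']))
        with Pool(processes=4) as pool:
            results = pool.map(compute_score_job, jobs)  # one result per job, order preserved
        print(results)
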
@@ -61,13 +88,17 @@ if __name__ == '__main__':
                         help=f'path where to store the tables')
     opt = parser.parse_args()
 
-    os.makedirs(opt.results, exist_ok=True)
+    assert os.path.exists(opt.results), f'result directory {opt.results} does not exist'
     os.makedirs(opt.tablepath, exist_ok=True)
 
-    eval_error = qp.error.ae
-    generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=eval_error)
-    generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=eval_error)
+    qp.environ["SAMPLE_SIZE"] = sample_size
+    absolute_error = qp.error.ae
+    relative_absolute_error = qp.error.rae
+
+    generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=absolute_error)
+    generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=absolute_error)
+    generate_table(f'{opt.tablepath}/npp.rae.tex', protocol='npp', error=relative_absolute_error)
+    generate_table(f'{opt.tablepath}/app.rae.tex', protocol='app', error=relative_absolute_error)
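
Each table cell aggregates an error measure over many test samples: true and estimated prevalence vectors are compared pairwise, then averaged. A minimal sketch of what an absolute-error function in the style of qp.error.ae computes, assuming it is the mean absolute difference between prevalence vectors (QuaPy's exact implementation may differ):

    import numpy as np

    def absolute_error(true_prev, estim_prev):
        # mean absolute difference between two prevalence vectors
        return np.abs(np.asarray(true_prev) - np.asarray(estim_prev)).mean()

    true_prevs = [np.array([0.7, 0.3]), np.array([0.5, 0.5])]
    estim_prevs = [np.array([0.6, 0.4]), np.array([0.4, 0.6])]
    scores = np.asarray([absolute_error(t, e) for t, e in zip(true_prevs, estim_prevs)])
    print(scores.mean())  # the value a table cell would summarize
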
@@ -7,11 +7,11 @@ from tqdm import tqdm
 from skmultilearn.dataset import load_dataset, available_data_sets
 from scipy.sparse import csr_matrix
 import quapy as qp
-from MultiLabel.mlclassification import MultilabelStackedClassifier
+from MultiLabel.mlclassification import MLStackedClassifier, LabelSpacePartion, MLTwinSVM, MLknn
 from MultiLabel.mldata import MultilabelledCollection
-from MultiLabel.mlquantification import MultilabelNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
+from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
     MLACC, \
-    MLPACC, MultilabelNaiveAggregativeQuantifier
+    MLPACC, MLNaiveAggregativeQuantifier, MLMLPE
 from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
 import numpy as np
 from data.dataset import Dataset
@@ -35,80 +35,136 @@ def calibratedCls():
 sample_size = 100
 n_samples = 5000
 
+SKMULTILEARN_ALL_DATASETS = sorted(set([x[0] for x in available_data_sets().keys()]))
+SKMULTILEARN_RED_DATASETS = [x+'-red' for x in SKMULTILEARN_ALL_DATASETS]
+TC_DATASETS = ['reuters21578', 'jrcall', 'ohsumed', 'rcv1']
+
+DATASETS = TC_DATASETS
+
 
 def models():
-    yield 'NaiveCC', MultilabelNaiveAggregativeQuantifier(CC(cls()))
-    yield 'NaivePCC', MultilabelNaiveAggregativeQuantifier(PCC(cls()))
-    yield 'NaiveACC', MultilabelNaiveAggregativeQuantifier(ACC(cls()))
-    yield 'NaivePACC', MultilabelNaiveAggregativeQuantifier(PACC(cls()))
-    # yield 'NaiveHDy', MultilabelNaiveAggregativeQuantifier(HDy(cls()))
-    # yield 'NaiveSLD', MultilabelNaiveAggregativeQuantifier(EMQ(calibratedCls()))
-    yield 'StackCC', MLCC(MultilabelStackedClassifier(cls()))
-    yield 'StackPCC', MLPCC(MultilabelStackedClassifier(cls()))
-    yield 'StackACC', MLACC(MultilabelStackedClassifier(cls()))
-    yield 'StackPACC', MLPACC(MultilabelStackedClassifier(cls()))
+    yield 'MLPE', MLMLPE()
+    yield 'NaiveCC', MLNaiveAggregativeQuantifier(CC(cls()))
+    yield 'NaivePCC', MLNaiveAggregativeQuantifier(PCC(cls()))
+    yield 'NaiveACC', MLNaiveAggregativeQuantifier(ACC(cls()))
+    yield 'NaivePACC', MLNaiveAggregativeQuantifier(PACC(cls()))
+    # yield 'NaiveHDy', MLNaiveAggregativeQuantifier(HDy(cls()))
+    # yield 'NaiveSLD', MLNaiveAggregativeQuantifier(EMQ(calibratedCls()))
+    yield 'StackCC', MLCC(MLStackedClassifier(cls()))
+    yield 'StackPCC', MLPCC(MLStackedClassifier(cls()))
+    yield 'StackACC', MLACC(MLStackedClassifier(cls()))
+    yield 'StackPACC', MLPACC(MLStackedClassifier(cls()))
     # yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random'))
     # yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random'))
     common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
-    yield 'MRQ-CC', MLRegressionQuantification(MultilabelNaiveQuantifier(CC(cls())), **common)
-    yield 'MRQ-PCC', MLRegressionQuantification(MultilabelNaiveQuantifier(PCC(cls())), **common)
-    yield 'MRQ-ACC', MLRegressionQuantification(MultilabelNaiveQuantifier(ACC(cls())), **common)
-    yield 'MRQ-PACC', MLRegressionQuantification(MultilabelNaiveQuantifier(PACC(cls())), **common)
-    yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), **common)
-    yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), **common)
-    # yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    # yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    # yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
-    # yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MultilabelStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-CC', MLRegressionQuantification(MLNaiveQuantifier(CC(cls())), **common)
+    yield 'MRQ-PCC', MLRegressionQuantification(MLNaiveQuantifier(PCC(cls())), **common)
+    yield 'MRQ-ACC', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common)
+    yield 'MRQ-PACC', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common)
+    yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), **common)
+    yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), protocol='app', **common)
+    yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), protocol='app', **common)
     # yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common)
     # yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common)
+    # yield 'LSP-CC', MLCC(LabelSpacePartion(cls()))
+    # yield 'LSP-ACC', MLACC(LabelSpacePartion(cls()))
+    # yield 'TwinSVM-CC', MLCC(MLTwinSVM())
+    # yield 'TwinSVM-ACC', MLACC(MLTwinSVM())
+    yield 'MLKNN-CC', MLCC(MLknn())
+    yield 'MLKNN-PCC', MLPCC(MLknn())
+    yield 'MLKNN-ACC', MLACC(MLknn())
+    yield 'MLKNN-PACC', MLPACC(MLknn())
 
 
-# dataset = 'reuters21578'
-# picklepath = '/home/moreo/word-class-embeddings/pickles'
-# data = Dataset.load(dataset, pickle_path=f'{picklepath}/{dataset}.pickle')
-# Xtr, Xte = data.vectorize()
-# ytr = data.devel_labelmatrix.todense().getA()
-# yte = data.test_labelmatrix.todense().getA()
+def get_dataset(dataset_name, dopickle=True):
+    datadir = f'{qp.util.get_quapy_home()}/pickles'
+    datapath = f'{datadir}/{dataset_name}.pkl'
+    if dopickle:
+        if os.path.exists(datapath):
+            print(f'returning pickled object in {datapath}')
+            return pickle.load(open(datapath, 'rb'))
 
-# remove categories with < 10 training documents
-# to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
-# ytr = ytr[:, to_keep]
-# yte = yte[:, to_keep]
-# print(f'num categories = {ytr.shape[1]}')
+    if dataset_name in SKMULTILEARN_ALL_DATASETS + SKMULTILEARN_RED_DATASETS:
+        clean_name = dataset_name.replace('-red','')
+        Xtr, ytr, feature_names, label_names = load_dataset(clean_name, 'train')
+        Xte, yte, _, _ = load_dataset(clean_name, 'test')
+        print(f'n-labels = {len(label_names)}')
 
-def datasets():
-    dataset_list = sorted(set([x[0] for x in available_data_sets().keys()]))
-    for dataset_name in dataset_list:
-        yield dataset_name
+        Xtr = csr_matrix(Xtr)
+        Xte = csr_matrix(Xte)
 
+        ytr = ytr.todense().getA()
+        yte = yte.todense().getA()
 
-def get_dataset(dataset_name):
-    Xtr, ytr, feature_names, label_names = load_dataset(dataset_name, 'train')
-    Xte, yte, _, _ = load_dataset(dataset_name, 'test')
-    print(f'n-labels = {len(label_names)}')
+        if dataset_name.endswith('-red'):
+            TO_SELECT = 10
+            nC = ytr.shape[1]
+            tr_counts = ytr.sum(axis=0)
+            te_counts = yte.sum(axis=0)
+            if nC > TO_SELECT:
+                Y = ytr.T.dot(ytr)  # class-class coincidence matrix
+                Y[np.triu_indices(nC)] = 0  # zeroing all duplicates entries and the diagonal
+                order_ij = np.argsort(-Y, axis=None)
+                selected = set()
+                p=0
+                while len(selected) < TO_SELECT:
+                    highest_index = order_ij[p]
+                    class_i = highest_index // nC
+                    class_j = highest_index % nC
+                    # if there is only one class to go, then add the most populated one
+                    most_populated, least_populated = (class_i, class_j) if tr_counts[class_i] > tr_counts[class_j] else (class_j, class_i)
+                    if te_counts[most_populated]>0:
+                        selected.add(most_populated)
+                    if len(selected) < TO_SELECT:
+                        if te_counts[least_populated]>0:
+                            selected.add(least_populated)
+                    p+=1
+                selected = np.asarray(sorted(selected))
+                ytr = ytr[:,selected]
+                yte = yte[:, selected]
+        # else:
+            # remove categories without positives in the training or test splits
+            # valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
+            # ytr = ytr[:, valid_categories]
+            # yte = yte[:, valid_categories]
 
-    Xtr = csr_matrix(Xtr)
-    Xte = csr_matrix(Xte)
+    elif dataset_name in TC_DATASETS:
+        picklepath = '/home/moreo/word-class-embeddings/pickles'
+        data = Dataset.load(dataset_name, pickle_path=f'{picklepath}/{dataset_name}.pickle')
+        Xtr, Xte = data.vectorize()
+        ytr = data.devel_labelmatrix.todense().getA()
+        yte = data.test_labelmatrix.todense().getA()
 
-    ytr = ytr.todense().getA()
-    yte = yte.todense().getA()
+        Xtr = csr_matrix(Xtr)
+        Xte = csr_matrix(Xte)
+        # remove categories with < 50 training or test documents
+        # to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
+        # keep the 10 most populated categories
+        to_keep = np.argsort(ytr.sum(axis=0))[-10:]
+        ytr = ytr[:, to_keep]
+        yte = yte[:, to_keep]
+        print(f'num categories = {ytr.shape[1]}')
 
-    # remove categories without positives in the training or test splits
-    valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
-    ytr = ytr[:, valid_categories]
-    yte = yte[:, valid_categories]
+    else:
+        raise ValueError(f'unknown dataset {dataset_name}')
 
     train = MultilabelledCollection(Xtr, ytr)
     test = MultilabelledCollection(Xte, yte)
 
+    if dopickle:
+        os.makedirs(datadir, exist_ok=True)
+        pickle.dump((train, test), open(datapath, 'wb'), pickle.HIGHEST_PROTOCOL)
+
     return train, test
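
The '-red' branch reduces a dataset to the TO_SELECT labels involved in the strongest label co-occurrences: ytr.T.dot(ytr) counts, for every pair of labels, the documents in which both are positive, and the upper triangle plus the diagonal are zeroed so each pair is ranked only once. The same computation on a toy label matrix (values illustrative):

    import numpy as np

    ytr = np.array([[1, 1, 0, 0],
                    [1, 1, 1, 0],
                    [0, 1, 1, 0],
                    [1, 0, 0, 1]])        # documents x labels

    nC = ytr.shape[1]
    Y = ytr.T.dot(ytr)                    # class-class coincidence matrix
    Y[np.triu_indices(nC)] = 0            # keep each unordered pair once, drop the diagonal
    order_ij = np.argsort(-Y, axis=None)  # flat indices, most co-occurring pair first
    i, j = order_ij[0] // nC, order_ij[0] % nC
    print(f'most co-occurring pair: labels {i} and {j} ({Y[i, j]} shared documents)')
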
@@ -176,8 +232,8 @@ def run_experiment(dataset_name, model_name, model):
 
     print(f'runing experiment {dataset_name} x {model_name}')
     train, test = get_dataset(dataset_name)
-    if train.n_classes>100:
-        return
+    # if train.n_classes>100:
+    #     return
 
     print_info(train, test)
 
@@ -186,8 +242,6 @@ def run_experiment(dataset_name, model_name, model):
     results_npp = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
     results_app = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=11, repeats=5)
 
     save_results(results_npp, results_app, result_path)
-    results_npp2, results_app2 = load_results(result_path)
-    print('pass')
 
 
 if __name__ == '__main__':
@@ -198,7 +252,7 @@ if __name__ == '__main__':
 
     os.makedirs(opt.results, exist_ok=True)
 
-    for datasetname, (modelname,model) in itertools.product(datasets(), models()):
+    for datasetname, (modelname,model) in itertools.product(DATASETS, models()):
         run_experiment(datasetname, modelname, model)
 
 
@@ -4,9 +4,19 @@ from sklearn.calibration import CalibratedClassifierCV
 from sklearn.linear_model import LogisticRegression
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.preprocessing import StandardScaler
+from skmultilearn.adapt import MLTSVM
+
+from skmultilearn.ensemble import LabelSpacePartitioningClassifier
+from skmultilearn.problem_transform import LabelPowerset
+from skmultilearn.cluster import NetworkXLabelGraphClusterer, LabelCooccurrenceGraphBuilder
+
+from skmultilearn.embedding import SKLearnEmbedder, EmbeddingClassifier
+from sklearn.manifold import SpectralEmbedding
+from sklearn.ensemble import RandomForestRegressor
+from skmultilearn.adapt import MLkNN
 
 
-class MultilabelStackedClassifier:  # aka Funnelling Monolingual
+class MLStackedClassifier:  # aka Funnelling Monolingual
     def __init__(self, base_estimator=LogisticRegression()):
         if not hasattr(base_estimator, 'predict_proba'):
             print('the estimator does not seem to be probabilistic: calibrating')
@@ -31,4 +41,51 @@ class MultilabelStackedClassifier:  # aka Funnelling Monolingual
     def predict_proba(self, X):
         P = self.base.predict_proba(X)
         P = self.norm.transform(P)
-        return self.meta.predict_proba(P)
+        return self.meta.predict_proba(P)
+
+
+class LabelSpacePartion:
+    def __init__(self, base_estimator=LogisticRegression()):
+        graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
+        self.classifier = LabelSpacePartitioningClassifier(
+            classifier=LabelPowerset(classifier=base_estimator),
+            clusterer=NetworkXLabelGraphClusterer(graph_builder, method='louvain')
+        )
+
+    def fit(self, X, y):
+        return self.classifier.fit(X, y)
+
+    def predict(self, X):
+        return self.classifier.predict(X).todense().getA()
+
+
+class MLTwinSVM:
+    def __init__(self):
+        self.classifier = MLTSVM()
+
+    def fit(self, X, y):
+        return self.classifier.fit(X, y)
+
+    def predict(self, X):
+        return self.classifier.predict(X).todense().getA()
+
+
+class MLknn:
+    #http://scikit.ml/api/skmultilearn.embedding.classifier.html#skmultilearn.embedding.EmbeddingClassifier
+    #notes: need to install package openne
+    def __init__(self):
+        self.classifier = EmbeddingClassifier(
+            SKLearnEmbedder(SpectralEmbedding(n_components=10)),
+            RandomForestRegressor(n_estimators=10),
+            MLkNN(k=5)
+        )
+
+    def fit(self, X, y):
+        return self.classifier.fit(X, y)
+
+    def predict(self, X):
+        return self.classifier.predict(X).todense().getA()
+
+    def predict_proba(self, X):
+        return self.classifier.predict_proba(X)
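
The renamed MLStackedClassifier keeps the funnelling scheme: a base multi-label classifier maps documents into a posterior-probability space, and a meta classifier is trained in that space. A self-contained sketch of the idea with plain scikit-learn components (dataset and sizes are illustrative):

    from sklearn.datasets import make_multilabel_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import StandardScaler

    X, y = make_multilabel_classification(n_samples=200, n_features=30, n_classes=5, random_state=0)

    base = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X, y)
    P = base.predict_proba(X)                  # documents x labels posterior matrix
    norm = StandardScaler().fit(P)             # z-score the posterior space
    meta = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(norm.transform(P), y)

    # prediction funnels new documents through base posteriors, then the meta classifier
    y_hat = meta.predict(norm.transform(base.predict_proba(X)))
    print(y_hat.shape)                         # (200, 5)
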
@@ -34,6 +34,10 @@ class MultilabelledCollection:
     def n_classes(self):
         return len(self.classes_)
 
+    @property
+    def n_features(self):
+        return self.instances.shape[1]
+
     @property
     def binary(self):
         return False
@@ -43,8 +47,8 @@ class MultilabelledCollection:
 
     def sampling_multi_index(self, size, cat, prev=None):
         if prev is None:  # no prevalence was indicated; returns an index for uniform sampling
-            return np.random.choice(len(self), size, replace=size>len(self))
-        aux = LabelledCollection(self.__gen_index(), self.labels[:,cat])
+            return np.random.choice(len(self), size, replace=size > len(self))
+        aux = LabelledCollection(self.__gen_index(), self.labels[:, cat])
         return aux.sampling_index(size, *[1-prev, prev])
 
     def uniform_sampling_multi_index(self, size):
@@ -9,7 +9,7 @@ from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, Mult
     ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor
 
 import quapy as qp
-from MultiLabel.mlclassification import MultilabelStackedClassifier
+from MultiLabel.mlclassification import MLStackedClassifier
 from MultiLabel.mldata import MultilabelledCollection
 from method.aggregative import CC, ACC, PACC, AggregativeQuantifier
 from method.base import BaseQuantifier
@@ -25,7 +25,19 @@ class MLQuantifier:
     def quantify(self, instances): ...
 
 
+class MLMLPE(MLQuantifier):
+    def fit(self, data: MultilabelledCollection):
+        self.tr_prev = data.prevalence()
+        return self
+
+    def quantify(self, instances):
+        return self.tr_prev
+
+
 class MLAggregativeQuantifier(MLQuantifier):
     def __init__(self, mlcls):
         self.learner = mlcls
 
     def fit(self, data:MultilabelledCollection):
         self.learner.fit(*data.Xy)
         return self
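
MLMLPE added above is the trivial baseline (maximum-likelihood prevalence estimation): it memorizes the training prevalence and returns it for every test sample, which any useful quantifier should beat. The same behaviour as a standalone sketch (class name hypothetical):

    import numpy as np

    class TrainPrevalenceBaseline:
        # always answers with the training prevalence, ignoring the test sample
        def fit(self, y_train):                        # y_train: documents x labels, binary
            pos = y_train.mean(axis=0)                 # positive prevalence per label
            self.tr_prev = np.vstack([1 - pos, pos]).T # one (neg, pos) row per label
            return self

        def quantify(self, instances):
            return self.tr_prev

    y_train = np.array([[1, 0], [1, 1], [0, 1], [1, 0]])
    print(TrainPrevalenceBaseline().fit(y_train).quantify(None))
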
@@ -42,9 +54,6 @@ class MLAggregativeQuantifier(MLQuantifier):
 
 
 class MLCC(MLAggregativeQuantifier):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def preclassify(self, instances):
         return self.learner.predict(instances)
 
@@ -55,16 +64,11 @@ class MLCC(MLAggregativeQuantifier):
 
 
 class MLPCC(MLCC):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def preclassify(self, instances):
         return self.learner.predict_proba(instances)
 
 
 class MLACC(MLCC):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def fit(self, data:MultilabelledCollection, train_prop=0.6):
         self.classes_ = data.classes_
@@ -88,8 +92,6 @@ class MLACC(MLCC):
 
 
 class MLPACC(MLPCC):
-    def __init__(self, mlcls):
-        self.learner = mlcls
-
     def fit(self, data:MultilabelledCollection, train_prop=0.6):
         self.classes_ = data.classes_
 
@@ -109,7 +111,7 @@ class MLPACC(MLPCC):
         return pacc_prevs
 
 
-class MultilabelNaiveQuantifier(MLQuantifier):
+class MLNaiveQuantifier(MLQuantifier):
     def __init__(self, q:BaseQuantifier, n_jobs=-1):
         self.q = q
         self.estimators = None
@@ -132,7 +134,7 @@ class MultilabelNaiveQuantifier(MLQuantifier):
         return np.asarray([neg_prevs, pos_prevs]).T
 
 
-class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregativeQuantifier):
+class MLNaiveAggregativeQuantifier(MLNaiveQuantifier, MLAggregativeQuantifier):
     def __init__(self, q:AggregativeQuantifier, n_jobs=-1):
         assert isinstance(q, AggregativeQuantifier), 'the quantifier is not of type aggregative!'
         self.q = q
@@ -156,7 +158,7 @@ class MultilabelNaiveAggregativeQuantifier(MultilabelNaiveQuantifier, MLAggregat
 
 class MLRegressionQuantification:
     def __init__(self,
-                 mlquantifier=MultilabelNaiveQuantifier(CC(LinearSVC())),
+                 mlquantifier=MLNaiveQuantifier(CC(LinearSVC())),
                  regression='ridge',
                  protocol='npp',
                  n_samples=500,
@@ -201,36 +203,31 @@ class MLRegressionQuantification:
 
         return Xs, ys
 
+    def _extract_features(self, sample, Xs, ys, samples_mean, samples_std):
+        ys.append(sample.prevalence()[:, 1])
+        Xs.append(self.estimator.quantify(sample.instances)[:, 1])
+        if self.means:
+            samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
+        if self.stds:
+            samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
+
     def generate_samples_npp(self, val):
-        samples_mean = []
-        samples_std = []
-        Xs = []
-        ys = []
+        Xs, ys = [], []
+        samples_mean, samples_std = [], []
         for sample in val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
-            ys.append(sample.prevalence()[:, 1])
-            Xs.append(self.estimator.quantify(sample.instances)[:, 1])
-            if self.means:
-                samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
-            if self.stds:
-                samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
+            self._extract_features(sample, Xs, ys, samples_mean, samples_std)
         return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
 
     def generate_samples_app(self, val):
-        samples_mean = []
-        samples_std = []
-        Xs = []
-        ys = []
+        Xs, ys = [], []
+        samples_mean, samples_std = [], []
         ncats = len(self.classes_)
         nprevs = 21
        repeats = max(self.n_samples // (ncats * nprevs), 1)
         for cat in self.classes_:
             for sample in val.artificial_sampling_generator(sample_size=self.sample_size, category=cat, n_prevalences=nprevs, repeats=repeats):
-                ys.append(sample.prevalence()[:, 1])
-                Xs.append(self.estimator.quantify(sample.instances)[:, 1])
-                if self.means:
-                    samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
-                if self.stds:
-                    samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
+                self._extract_features(sample, Xs, ys, samples_mean, samples_std)
         return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
 
     def fit(self, data:MultilabelledCollection):
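
MLRegressionQuantification, whose sample-generation routines are refactored above into _extract_features, learns a regressor from estimated prevalence vectors (the features Xs) to true ones (the targets ys) over many validation samples. The correction idea in isolation, on toy data (the commit's pipeline builds Xs from an underlying quantifier instead):

    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    true_prevs = rng.dirichlet(np.ones(5), size=200)   # 200 samples x 5 labels
    bias = rng.normal(0.1, 0.05, true_prevs.shape)
    estim_prevs = np.clip(true_prevs + bias, 0, 1)     # systematically biased estimates

    reg = Ridge().fit(estim_prevs, true_prevs)         # learn to undo the bias
    corrected = reg.predict(estim_prevs)
    print(np.abs(corrected - true_prevs).mean(),       # error after correction ...
          np.abs(estim_prevs - true_prevs).mean())     # ... vs before
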
@@ -6,10 +6,10 @@ from scipy.stats import ttest_ind_from_stats, wilcoxon
 class Table:
     VALID_TESTS = [None, "wilcoxon", "ttest"]
 
-    def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
+    def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='ttest', prec_mean=3,
                  clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
                  color=True):
-        assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
+        assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
 
         self.benchmarks = np.asarray(benchmarks)
         self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}
@@ -21,7 +21,7 @@ class Table:
         # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
         self._addmap('values', dtype=object)
         self.lower_is_better = lower_is_better
-        self.ttest = ttest
+        self.ttest = significance_test
         self.prec_mean = prec_mean
         self.clean_zero = clean_zero
         self.show_std = show_std
@@ -156,8 +156,9 @@ class Table:
         return all(self.map['fill'][:, self.method_index[col]])
 
     def _addave(self):
-        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, ttest=self.ttest, average=False,
-                    missing=self.missing, missing_str=self.missing_str)
+        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
+                    missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
+                    show_std=self.show_std)
         for col in self.methods:
             values = None
             if self._is_column_full(col):
@@ -267,12 +268,37 @@ class Table:
         tab += self.latexAverage()
         return tab
 
+    def latexTabularT(self, benchmark_replace={}, method_replace={}, average=True, side=False):
+        def withside(label):
+            return '\side{'+label+'}' if side else label
+
+        tab = ' & '
+        tab += ' & '.join([withside(benchmark_replace.get(col, col)) for col in self.benchmarks])
+        if average:
+            tab += ' & ' + withside('Ave')
+        tab += ' \\\\\hline\n'
+        for row in self.methods:
+            rowname = method_replace.get(row, row)
+            tab += rowname + ' & '
+            tab += self.latexRowT(row, endl='')
+            if average:
+                tab += ' & '
+                tab += self.average.latexCell('ave', row)
+            tab += '\\\\\hline\n'
+        return tab
+
     def latexRow(self, benchmark, endl='\\\\\hline\n'):
         s = [self.latexCell(benchmark, col) for col in self.methods]
         s = ' & '.join(s)
         s += ' ' + endl
         return s
 
+    def latexRowT(self, method, endl='\\\\\hline\n'):
+        s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
+        s = ' & '.join(s)
+        s += ' ' + endl
+        return s
+
     def latexAverage(self, endl='\\\\\hline\n'):
         if self.add_average:
             return self.average.latexRow('ave', endl=endl)
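
latexTabularT writes the table transposed with respect to latexTabular: benchmarks run along the header row and each method becomes a row, which suits many-method/few-dataset layouts. A toy rendering of that layout as a plain function, without the Table class's colouring and significance marks:

    def latex_tabular_T(scores, benchmarks, methods):
        # header row: one column per benchmark
        lines = [' & ' + ' & '.join(benchmarks) + ' \\\\\\hline']
        for m in methods:
            # one row per method, one cell per benchmark
            cells = [f'{scores[(b, m)]:.3f}' for b in benchmarks]
            lines.append(m + ' & ' + ' & '.join(cells) + ' \\\\\\hline')
        return '\n'.join(lines)

    scores = {('reuters21578', 'NaiveCC'): 0.042, ('ohsumed', 'NaiveCC'): 0.061,
              ('reuters21578', 'StackCC'): 0.035, ('ohsumed', 'StackCC'): 0.052}
    print(latex_tabular_T(scores, ['reuters21578', 'ohsumed'], ['NaiveCC', 'StackCC']))
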