committing last changes before creating a branch

Alejandro Moreo Fernandez 2021-10-13 11:53:19 +02:00
parent 4572ec266d
commit 6f3f103b3b
5 changed files with 197 additions and 30 deletions

View File

@@ -23,12 +23,19 @@ import os
import pickle
models = [#'MLPE',
'NaiveCC', 'NaivePCC', 'NaiveACC', 'NaivePACC', #'NaiveHDy', 'NaiveSLD',
'StackCC', 'StackPCC', 'StackACC', 'StackPACC',
'MRQ-CC', 'MRQ-PCC', 'MRQ-ACC', 'MRQ-PACC',
'MRQ-StackCC', 'MRQ-StackPCC', 'MRQ-StackACC', 'MRQ-StackPACC',
'MRQ-StackCC-app', 'MRQ-StackPCC-app', 'MRQ-StackACC-app', 'MRQ-StackPACC-app',
'LSP-CC', 'LSP-ACC'
'NaiveCC', 'NaivePCC', 'NaivePCCcal', 'NaiveACC', 'NaivePACC', 'NaivePACCcal', 'NaiveACCit', 'NaivePACCit',
#'NaiveHDy', 'NaiveSLD',
'ChainCC', 'ChainPCC', 'ChainACC', 'ChainPACC',
'StackCC', 'StackPCC', 'StackPCCcal', 'StackACC', 'StackPACC', 'StackPACCcal', 'StackACCit', 'StackPACCit',
'MRQ-CC', 'MRQ-PCC', 'MRQ-ACC', 'MRQ-PACC', 'MRQ-ACCit', 'MRQ-PACCit',
'StackMRQ-CC', 'StackMRQ-PCC', 'StackMRQ-ACC', 'StackMRQ-PACC',
'MRQ-StackCC', 'MRQ-StackPCC', 'MRQ-StackACC', 'MRQ-StackPACC',
'StackMRQ-StackCC', 'StackMRQ-StackPCC', 'StackMRQ-StackACC', 'StackMRQ-StackPACC',
'MRQ-StackCC-app', 'MRQ-StackPCC-app', 'MRQ-StackACC-app', 'MRQ-StackPACC-app',
'StackMRQ-StackCC-app', 'StackMRQ-StackPCC-app', 'StackMRQ-StackACC-app', 'StackMRQ-StackPACC-app',
'LSP-CC', 'LSP-ACC', 'MLKNN-CC', 'MLKNN-ACC',
'MLAdjustedC', 'MLStackAdjustedC', 'MLprobAdjustedC', 'MLStackProbAdjustedC'
]
# datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
@@ -64,6 +71,12 @@ def generate_table(path, protocol, error):
dataset, model, scores = r
table.add(dataset, model, scores)
save_table(table, path)
save_table(table.getRankTable(), path.replace('.tex','.rank.tex'))
def save_table(table, path):
tabular = """
\\resizebox{\\textwidth}{!}{%
\\begin{tabular}{|c||""" + ('c|' * len(models)) + """} \\hline
@@ -79,7 +92,6 @@ def generate_table(path, protocol, error):
with open(path, 'wt') as foo:
foo.write(tabular)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Experiments for multi-label quantification')
parser.add_argument('--results', type=str, default='./results', metavar='str',

View File

@@ -2,6 +2,8 @@ import argparse
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
import itertools
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain
from tqdm import tqdm
from skmultilearn.dataset import load_dataset, available_data_sets
@@ -11,7 +13,7 @@ from MultiLabel.mlclassification import MLStackedClassifier, LabelSpacePartion,
from MultiLabel.mldata import MultilabelledCollection
from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
MLACC, \
MLPACC, MLNaiveAggregativeQuantifier, MLMLPE
MLPACC, MLNaiveAggregativeQuantifier, MLMLPE, StackMLRQuantifier, MLadjustedCount, MLprobAdjustedCount
from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
import numpy as np
from data.dataset import Dataset
@@ -49,23 +51,33 @@ def models():
yield 'MLPE', MLMLPE()
yield 'NaiveCC', MLNaiveAggregativeQuantifier(CC(cls()))
yield 'NaivePCC', MLNaiveAggregativeQuantifier(PCC(cls()))
yield 'NaivePCCcal', MLNaiveAggregativeQuantifier(PCC(calibratedCls()))
yield 'NaiveACC', MLNaiveAggregativeQuantifier(ACC(cls()))
yield 'NaivePACC', MLNaiveAggregativeQuantifier(PACC(cls()))
yield 'NaivePACCcal', MLNaiveAggregativeQuantifier(PACC(calibratedCls()))
yield 'NaiveACCit', MLNaiveAggregativeQuantifier(ACC(cls()))
yield 'NaivePACCit', MLNaiveAggregativeQuantifier(PACC(cls()))
# yield 'NaiveHDy', MLNaiveAggregativeQuantifier(HDy(cls()))
# yield 'NaiveSLD', MLNaiveAggregativeQuantifier(EMQ(calibratedCls()))
yield 'StackCC', MLCC(MLStackedClassifier(cls()))
yield 'StackPCC', MLPCC(MLStackedClassifier(cls()))
yield 'StackPCCcal', MLPCC(MLStackedClassifier(calibratedCls()))
yield 'StackACC', MLACC(MLStackedClassifier(cls()))
yield 'StackPACC', MLPACC(MLStackedClassifier(cls()))
# yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None, order='random'))
# yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None, order='random'))
# yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None, order='random'))
# yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None, order='random'))
yield 'StackPACCcal', MLPACC(MLStackedClassifier(calibratedCls()))
yield 'StackACCit', MLACC(MLStackedClassifier(cls()))
yield 'StackPACCit', MLPACC(MLStackedClassifier(cls()))
# yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None))
# yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None))
# yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None))
# yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None))
common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
yield 'MRQ-CC', MLRegressionQuantification(MLNaiveQuantifier(CC(cls())), **common)
yield 'MRQ-PCC', MLRegressionQuantification(MLNaiveQuantifier(PCC(cls())), **common)
yield 'MRQ-ACC', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common)
yield 'MRQ-PACC', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common)
yield 'MRQ-ACCit', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common)
yield 'MRQ-PACCit', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common)
yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), **common)
yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), **common)
yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), **common)
@@ -74,6 +86,23 @@ def models():
yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'StackMRQ-CC', StackMLRQuantifier(MLNaiveQuantifier(CC(cls())), **common)
yield 'StackMRQ-PCC', StackMLRQuantifier(MLNaiveQuantifier(PCC(cls())), **common)
yield 'StackMRQ-ACC', StackMLRQuantifier(MLNaiveQuantifier(ACC(cls())), **common)
yield 'StackMRQ-PACC', StackMLRQuantifier(MLNaiveQuantifier(PACC(cls())), **common)
yield 'StackMRQ-StackCC', StackMLRQuantifier(MLCC(MLStackedClassifier(cls())), **common)
yield 'StackMRQ-StackPCC', StackMLRQuantifier(MLPCC(MLStackedClassifier(cls())), **common)
yield 'StackMRQ-StackACC', StackMLRQuantifier(MLACC(MLStackedClassifier(cls())), **common)
yield 'StackMRQ-StackPACC', StackMLRQuantifier(MLPACC(MLStackedClassifier(cls())), **common)
yield 'StackMRQ-StackCC-app', StackMLRQuantifier(MLCC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'StackMRQ-StackPCC-app', StackMLRQuantifier(MLPCC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'StackMRQ-StackACC-app', StackMLRQuantifier(MLACC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'StackMRQ-StackPACC-app', StackMLRQuantifier(MLPACC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'MLAdjustedC', MLadjustedCount(OneVsRestClassifier(cls()))
yield 'MLStackAdjustedC', MLadjustedCount(MLStackedClassifier(cls()))
# yield 'MLprobAdjustedC', MLprobAdjustedCount(OneVsRestClassifier(calibratedCls()))
# yield 'MLStackProbAdjustedC', MLprobAdjustedCount(MLStackedClassifier(calibratedCls()))
# yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common)
# yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common)
# yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common)
@@ -82,10 +111,10 @@ def models():
# yield 'LSP-ACC', MLACC(LabelSpacePartion(cls()))
# yield 'TwinSVM-CC', MLCC(MLTwinSVM())
# yield 'TwinSVM-ACC', MLACC(MLTwinSVM())
yield 'MLKNN-CC', MLCC(MLknn())
yield 'MLKNN-PCC', MLPCC(MLknn())
yield 'MLKNN-ACC', MLACC(MLknn())
yield 'MLKNN-PACC', MLPACC(MLknn())
# yield 'MLKNN-CC', MLCC(MLknn())
#yield 'MLKNN-PCC', MLPCC(MLknn())
# yield 'MLKNN-ACC', MLACC(MLknn())
#yield 'MLKNN-PACC', MLPACC(MLknn())
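The models() generator above yields (name, quantifier) pairs for the experiment driver to consume. A minimal sketch of such a loop, assuming `train` and `test` are hypothetical MultilabelledCollection objects and that each quantifier exposes the fit/quantify interface shown elsewhere in this diff:

# Hypothetical driver loop; `train` and `test` are placeholder MultilabelledCollection objects.
for model_name, quantifier in models():
    quantifier.fit(train)                              # train the multi-label quantifier
    estim_prevs = quantifier.quantify(test.instances)  # estimated prevalence per label
    print(model_name, estim_prevs)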
def get_dataset(dataset_name, dopickle=True):

View File

@@ -1,7 +1,7 @@
from copy import deepcopy
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from skmultilearn.adapt import MLTSVM
@@ -44,6 +44,25 @@ class MLStackedClassifier: # aka Funnelling Monolingual
return self.meta.predict_proba(P)
class MLStackedRegressor:
def __init__(self, base_regressor=Ridge(normalize=True)):
self.base = deepcopy(base_regressor)
self.meta = deepcopy(base_regressor)
def fit(self, X, y):
assert y.ndim==2, 'the dataset does not seem to be multi-label'
self.base.fit(X, y)
R = self.base.predict(X)
# R = self.norm.fit_transform(R)
self.meta.fit(R, y)
return self
def predict(self, X):
R = self.base.predict(X)
# R = self.norm.transform(R)
return self.meta.predict(R)
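A minimal usage sketch for the MLStackedRegressor added above, with made-up data; the base regressor maps features to the full label matrix, and the meta regressor is re-fit on those base predictions so it can exploit correlations between labels:

import numpy as np
X = np.random.rand(100, 20)                         # hypothetical feature matrix
Y = (np.random.rand(100, 5) > 0.5).astype(float)    # hypothetical multi-label targets
reg = MLStackedRegressor()                          # Ridge(normalize=True) base and meta by default
reg.fit(X, Y)
Y_hat = reg.predict(X)                              # shape (100, 5)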
class LabelSpacePartion:
def __init__(self, base_estimator=LogisticRegression()):
graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)

View File

@@ -6,6 +6,7 @@ from sklearn.model_selection import train_test_split
from quapy.data import LabelledCollection
from quapy.functional import artificial_prevalence_sampling
from skmultilearn.model_selection import iterative_train_test_split
class MultilabelledCollection:
def __init__(self, instances, labels):
@@ -67,10 +68,13 @@ class MultilabelledCollection:
labels = self.labels[index]
return MultilabelledCollection(documents, labels)
def train_test_split(self, train_prop=0.6, random_state=None):
#raise ValueError('use the scikit-multilearn implementation')
tr_docs, te_docs, tr_labels, te_labels = \
train_test_split(self.instances, self.labels, train_size=train_prop, random_state=random_state)
def train_test_split(self, train_prop=0.6, random_state=None, iterative=False):
if iterative:
tr_docs, tr_labels, te_docs, te_labels = \
iterative_train_test_split(self.instances, self.labels, test_size=1-train_prop)
else:
tr_docs, te_docs, tr_labels, te_labels = \
train_test_split(self.instances, self.labels, train_size=train_prop, random_state=random_state)
return MultilabelledCollection(tr_docs, tr_labels), MultilabelledCollection(te_docs, te_labels)
def artificial_sampling_generator(self, sample_size, category, n_prevalences=101, repeats=1):
@@ -98,6 +102,10 @@ class MultilabelledCollection:
for c in self.classes_:
yield self.asLabelledCollection(c)
# @property
# def label_cardinality(self):
# return self.labels.sum()/len(self)
@property
def Xy(self):
return self.instances, self.labels
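The new `iterative` flag in train_test_split switches between a plain random split and scikit-multilearn's iterative stratification, which tries to preserve per-label proportions in both folds. A minimal sketch, assuming `data` is a hypothetical MultilabelledCollection:

train, test = data.train_test_split(train_prop=0.6, random_state=0)        # random split
train_it, test_it = data.train_test_split(train_prop=0.6, iterative=True)  # iteratively stratified split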

View File

@@ -1,6 +1,8 @@
import numpy as np
from copy import deepcopy
import sklearn.preprocessing
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import confusion_matrix
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
@@ -9,7 +11,7 @@ from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, Mult
ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor
import quapy as qp
from MultiLabel.mlclassification import MLStackedClassifier
from MultiLabel.mlclassification import MLStackedClassifier, MLStackedRegressor
from MultiLabel.mldata import MultilabelledCollection
from method.aggregative import CC, ACC, PACC, AggregativeQuantifier
from method.base import BaseQuantifier
@@ -166,13 +168,17 @@ class MLRegressionQuantification:
norm=True,
means=True,
stds=True):
assert regression in ['ridge', 'svr'], 'unknown regression model'
assert protocol in ['npp', 'app'], 'unknown protocol'
self.estimator = mlquantifier
if regression == 'ridge':
self.reg = Ridge(normalize=norm)
elif regression == 'svr':
self.reg = MultiOutputRegressor(LinearSVR())
if isinstance(regression, str):
assert regression in ['ridge', 'svr'], 'unknown regression model'
if regression == 'ridge':
self.reg = Ridge(normalize=norm)
elif regression == 'svr':
self.reg = MultiOutputRegressor(LinearSVR())
else:
self.reg = regression
self.protocol = protocol
# self.reg = MultiTaskLassoCV(normalize=norm)
# self.reg = KernelRidge(kernel='rbf')
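Since `regression` may now be a regressor instance rather than only the strings 'ridge' or 'svr', a custom multi-output regressor can be plugged in directly; this mirrors what StackMLRQuantifier does below. A minimal sketch using names available in this module:

custom_reg = MLStackedRegressor(Ridge(normalize=True))
q = MLRegressionQuantification(
    mlquantifier=MLNaiveQuantifier(CC(LinearSVC())),
    regression=custom_reg,        # regressor instance instead of 'ridge'/'svr'
    protocol='npp', n_samples=500, sample_size=500,
    norm=True, means=False, stds=False)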
@@ -215,7 +221,7 @@ class MLRegressionQuantification:
Xs, ys = [], []
samples_mean, samples_std = [], []
for sample in val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
self._extract_features(self, sample, Xs, ys, samples_mean, samples_std)
self._extract_features(sample, Xs, ys, samples_mean, samples_std)
return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
@@ -227,7 +233,7 @@ class MLRegressionQuantification:
repeats = max(self.n_samples // (ncats * nprevs), 1)
for cat in self.classes_:
for sample in val.artificial_sampling_generator(sample_size=self.sample_size, category=cat, n_prevalences=nprevs, repeats=repeats):
self._extract_features(self, sample, Xs, ys, samples_mean, samples_std)
self._extract_features(sample, Xs, ys, samples_mean, samples_std)
return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
def fit(self, data:MultilabelledCollection):
@@ -259,4 +265,97 @@ class MLRegressionQuantification:
return np.asarray([neg_prevs, adjusted]).T
# class
class StackMLRQuantifier:
def __init__(self,
mlquantifier=MLNaiveQuantifier(CC(LinearSVC())),
regression='ridge',
protocol='npp',
n_samples=500,
sample_size=500,
norm=True,
means=True,
stds=True):
if regression == 'ridge':
reg = MLStackedRegressor(Ridge(normalize=True))
elif regression == 'svr':
reg = MLStackedRegressor(MultiOutputRegressor(LinearSVR()))
else:
raise ValueError(f'unknown regressor {regression}')
self.base = MLRegressionQuantification(
mlquantifier=mlquantifier,
regression=reg,
protocol=protocol,
n_samples=n_samples,
sample_size=sample_size,
norm=norm,
means=means,
stds=stds)
def fit(self, data:MultilabelledCollection):
self.classes_ = data.classes_
self.base.fit(data)
return self
def quantify(self, instances):
return self.base.quantify(instances)
class MLadjustedCount(MLAggregativeQuantifier):
def __init__(self, learner):
self.learner = learner
def preclassify(self, instances):
return self.learner.predict(instances)
def fit(self, data: MultilabelledCollection, train_prop=0.6):
self.classes_ = data.classes_
train, val = data.train_test_split(train_prop=train_prop)
self.learner.fit(*train.Xy)
val_predictions = self.preclassify(val.instances)
val_true = val.labels
N = len(val)
C = val_predictions.T.dot(val_true) / N # joint probabilities [[P(y1,\hat{y}1), P(y2,\hat{y}1)], ... ]
priorP = val_predictions.mean(axis=0).reshape(-1,1) # priors [P(hat{y}1), P(hat{y}2), ...]
self.Pte_cond_estim_ = np.true_divide(C, priorP, where=priorP>0) # cond probabilities [[P(y1|\hat{y}1), P(y2|\hat{y}1)], ... ]
return self
def aggregate(self, predictions):
P = sklearn.preprocessing.normalize(predictions, norm='l1')
correction = P.dot(self.Pte_cond_estim_)
adjusted = correction.mean(axis=0)
return np.asarray([1-adjusted, adjusted]).T
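A worked toy example of the correction performed by MLadjustedCount.aggregate, with invented numbers, reproducing the same operations as the method above:

import numpy as np, sklearn.preprocessing
Pte_cond_estim_ = np.array([[0.9, 0.2],      # hypothetical row P(y_j | \hat{y}_1)
                            [0.3, 0.8]])     # hypothetical row P(y_j | \hat{y}_2)
predictions = np.array([[1, 0], [1, 1], [0, 1]])              # hard predictions, 3 instances x 2 labels
P = sklearn.preprocessing.normalize(predictions, norm='l1')   # rows sum to 1
adjusted = P.dot(Pte_cond_estim_).mean(axis=0)                # corrected positive prevalence per label
prevalences = np.asarray([1 - adjusted, adjusted]).T          # [negative, positive] per label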
class MLprobAdjustedCount(MLAggregativeQuantifier):
def __init__(self, learner):
self.learner = learner
def preclassify(self, instances):
return self.learner.predict_proba(instances)
def fit(self, data: MultilabelledCollection, train_prop=0.6):
self.classes_ = data.classes_
train, val = data.train_test_split(train_prop=train_prop)
self.learner.fit(*train.Xy)
val_predictions = self.preclassify(val.instances)
val_true = val.labels
N = len(val)
C = (val_predictions>0.5).T.dot(val_true) / N # joint probabilities [[P(y1,\hat{y}1), P(y2,\hat{y}1)], ... ]
# not sure...
priorP = val_predictions.mean(axis=0).reshape(-1,1) # priors [P(hat{y}1), P(hat{y}2), ...]
self.Pte_cond_estim_ = np.true_divide(C, priorP, where=priorP>0) # cond probabilities [[P(y1|\hat{y}1), P(y2|\hat{y}1)], ... ]
return self
def aggregate(self, predictions):
P = sklearn.preprocessing.normalize(predictions, norm='l1')
correction = P.dot(self.Pte_cond_estim_)
adjusted = correction.mean(axis=0)
return np.asarray([1-adjusted, adjusted]).T
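A minimal end-to-end sketch of the new adjusted-count quantifiers, assuming (as with the other aggregative quantifiers in this module) that MLAggregativeQuantifier provides a quantify() that chains preclassify() and aggregate(); `train_coll` and `test_instances` are hypothetical:

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
q = MLadjustedCount(OneVsRestClassifier(LogisticRegression(max_iter=1000)))
q.fit(train_coll)                   # fits the classifier and estimates P(y | \hat{y}) on a held-out split
prevs = q.quantify(test_instances)  # one [negative, positive] prevalence pair per label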