experimental method ave-pool; not working because OneVsAll is aggregative and ave-pool is not

Alejandro Moreo Fernandez 2021-01-20 17:03:12 +01:00
parent 99132c8166
commit 1ba0748b59
4 changed files with 106 additions and 14 deletions
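
Why it does not work: OneVsAll reduces a multiclass problem to one binary problem per class, and it operates through the classify()/aggregate() contract of AggregativeQuantifier. The ave-pool method introduced here instead regresses on averaged sample representations, so there is no per-instance classifier output to aggregate. A minimal sketch of the mismatch, assuming the repo layout of this commit (the assertions are illustrative, not part of the commit):

from quapy.method.base import BinaryQuantifier
from quapy.method.aggregative import AggregativeQuantifier, OneVsAll
from NewMethods.methods import AveragePoolQuantification

# ave-pool is a binary quantifier, but not an aggregative one:
# it exposes no classify()/aggregate() split for OneVsAll to exploit
assert issubclass(AveragePoolQuantification, BinaryQuantifier)
assert not issubclass(AveragePoolQuantification, AggregativeQuantifier)

# OneVsAll.quantify() calls self.classify(X) and then self.aggregate(...),
# so wrapping ave-pool as OneVsAll(AveragePoolQuantification(...)) cannot work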

NewMethods/methods.py
View File

@@ -1,7 +1,15 @@
-from typing import Union
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
 import quapy as qp
+from typing import Union
+from quapy.data import LabelledCollection
+from quapy.method.base import BaseQuantifier, BinaryQuantifier
 from quapy.method.aggregative import PACC, EMQ, HDy
 import quapy.functional as F
+from tqdm import tqdm


 class PACCSLD(PACC):
@@ -35,3 +43,83 @@ class HDySLD(HDy):
     def aggregate(self, classif_posteriors):
         priors, posteriors = EMQ.EM(self.train_prevalence, classif_posteriors, epsilon=1e-4)
         return super(HDySLD, self).aggregate(posteriors)
+
+
+class AveragePoolQuantification(BinaryQuantifier):
+    def __init__(self, learner, sample_size, trials, n_components=-1, zscore=False):
+        self.learner = learner
+        self.sample_size = sample_size
+        self.trials = trials
+
+        self.do_zscore = zscore
+        self.zscore = StandardScaler() if self.do_zscore else None
+
+        self.do_pca = n_components > 0
+        self.pca = PCA(n_components) if self.do_pca else None
+
+    def fit(self, data: LabelledCollection):
+        training, validation = data.split_stratified(train_prop=0.7)
+
+        X, y = [], []
+
+        # generate samples along a prevalence grid; represent each by its mean vector
+        nprevpoints = F.get_nprevpoints_approximation(self.trials, data.n_classes)
+        for sample in tqdm(
+                training.artificial_sampling_generator(self.sample_size, n_prevalences=nprevpoints, repeats=1),
+                desc='generating averages'
+        ):
+            X.append(sample.instances.mean(axis=0))
+            y.append(sample.prevalence()[1])
+
+        # complete the budget with samples drawn at uniformly random prevalences
+        while len(X) < self.trials:
+            sample = training.sampling(self.sample_size, F.uniform_simplex_sampling(data.n_classes))
+            X.append(sample.instances.mean(axis=0))
+            y.append(sample.prevalence()[1])  # positive-class prevalence, consistent with the loop above (bug fix)
+
+        X = np.asarray(np.vstack(X))
+        y = np.asarray(y)
+
+        if self.do_pca:
+            X = self.pca.fit_transform(X)
+            print(X.shape)
+
+        if self.do_zscore:
+            X = self.zscore.fit_transform(X)
+
+        print('training regressor...')
+        self.regressor = self.learner.fit(X, y)
+
+        # corrections at 0 and 1: average the regressor's predictions on held-out
+        # validation samples of pure negative/positive prevalence
+        print('getting corrections...')
+        X0 = np.asarray(np.vstack([validation.sampling(self.sample_size, 0., shuffle=False).instances.mean(axis=0) for _ in range(100)]))
+        X1 = np.asarray(np.vstack([validation.sampling(self.sample_size, 1., shuffle=False).instances.mean(axis=0) for _ in range(100)]))
+        if self.do_pca:
+            X0 = self.pca.transform(X0)
+            X1 = self.pca.transform(X1)
+        if self.do_zscore:
+            X0 = self.zscore.transform(X0)
+            X1 = self.zscore.transform(X1)
+        self.correction_0 = self.regressor.predict(X0).mean()
+        self.correction_1 = self.regressor.predict(X1).mean()
+
+        print('correction-0', self.correction_0)
+        print('correction-1', self.correction_1)
+        print('done')
+
+    def quantify(self, instances):
+        ave = np.asarray(instances.mean(axis=0)).reshape(1, -1)  # ensure a 2D row vector for transform/predict
+        if self.do_pca:
+            ave = self.pca.transform(ave)
+        if self.do_zscore:
+            ave = self.zscore.transform(ave)
+        phat = self.regressor.predict(ave).item()
+        # rescale so that the corrected estimates at prevalences 0 and 1 map to 0 and 1
+        phat = np.clip((phat - self.correction_0) / (self.correction_1 - self.correction_0), 0, 1)
+        return np.asarray([1 - phat, phat])
+
+    def set_params(self, **parameters):
+        self.learner.set_params(**parameters)
+
+    def get_params(self, deep=True):
+        return self.learner.get_params(deep=deep)
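
For reference, a minimal end-to-end sketch of how the new quantifier is meant to be driven (mirroring the test.py changes below; the 'kindle' dataset and the sample_size value are assumptions for illustration):

from sklearn.svm import LinearSVR
import quapy as qp
from NewMethods.methods import AveragePoolQuantification

sample_size = 500  # assumption: any fixed size works if used consistently at fit and test time
dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)  # a binary dataset

regressor = LinearSVR(max_iter=10000)
model = AveragePoolQuantification(regressor, sample_size, trials=5000, n_components=500, zscore=False)
model.fit(dataset.training)

# quantify a test sample by its mean vector; returns [1-phat, phat]
estim_prev = model.quantify(dataset.test.instances)
print(f'true={dataset.test.prevalence()}, estim={estim_prev}')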

View File

@@ -32,6 +32,7 @@ nice = {
     'quanet': 'QuaNet',
     'hdy': 'HDy',
     'dys': 'DyS',
+    'epaccmaeptr': 'E(PACC)$_\mathrm{Ptr}$',
     'svmperf':'',
     'sanders': 'Sanders',
     'semeval13': 'SemEval13',
@@ -116,7 +117,7 @@ if __name__ == '__main__':
     datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
     evaluation_measures = [qp.error.ae, qp.error.rae]
     gao_seb_methods = ['cc', 'acc', 'pcc', 'pacc', 'sld', 'svmq', 'svmkld', 'svmnkld']
-    new_methods = ['hdy']
+    new_methods = ['hdy', 'quanet', 'epaccptr']

     gao_seb_ranks, gao_seb_results = get_ranks_from_Gao_Sebastiani()

quapy/method/aggregative.py
View File

@@ -547,8 +547,6 @@ class OneVsAll(AggregativeQuantifier):
         else:
             predictions = self.classify(X)
             return self.aggregate(predictions)
-        #prevalences = self.__parallel(self._delayed_binary_quantify, X)
-        #return F.normalize_prevalence(prevalences)

     def __parallel(self, func, *args, **kwargs):
         return np.asarray(
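
The two lines removed above were an already-commented-out alternative quantify() path (per-class parallel quantification followed by prevalence normalization). What remains is the classify()-then-aggregate() route, i.e., exactly the aggregative contract that, per the commit message, the new ave-pool method cannot satisfy.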

test.py
View File

@@ -1,10 +1,12 @@
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV
-from sklearn.svm import LinearSVC
+from sklearn.svm import LinearSVC, LinearSVR
 import quapy as qp
 import quapy.functional as F
 import sys
 import numpy as np
+from NewMethods.methods import AveragePoolQuantification
 from classification.methods import PCALR
 from classification.neural import NeuralClassifierTrainer, CNNnet
 from quapy.model_selection import GridSearchQ
@@ -29,7 +31,7 @@ if binary:
 else:
     dataset = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True)
-    dataset.training = dataset.training.sampling(sample_size, 0.2, 0.5, 0.3)
+    #dataset.training = dataset.training.sampling(sample_size, 0.2, 0.5, 0.3)

 print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}')
@@ -51,14 +53,17 @@ print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}')
 #model = qp.method.meta.QuaNet(learner, sample_size, device='cpu')
 #learner = GridSearchCV(LogisticRegression(max_iter=1000), param_grid=param_grid, n_jobs=-1, verbose=1)
-learner = LogisticRegression(max_iter=1000)
+#learner = LogisticRegression(max_iter=1000)
 # model = qp.method.aggregative.ClassifyAndCount(learner)
-model = qp.method.meta.EPACC(learner, size=10, red_size=5,
-                             param_grid={'C':[1,10,100]},
-                             optim='mae', param_mod_sel={'sample_size':100, 'n_prevpoints':21, 'n_repetitions':5},
-                             policy='ptr', n_jobs=1)
+#model = qp.method.meta.EPACC(learner, size=10, red_size=5,
+#                             param_grid={'C':[1,10,100]},
+#                             optim='mae', param_mod_sel={'sample_size':100, 'n_prevpoints':21, 'n_repetitions':5},
+#                             policy='ptr', n_jobs=1)
+
+regressor = LinearSVR(max_iter=10000)
+param_grid = {'C': np.logspace(-1,3,5)}
+model = AveragePoolQuantification(regressor, sample_size, trials=5000, n_components=500, zscore=False)
 #model = qp.method.meta.EHDy(learner, param_grid=param_grid, optim='mae',
 #                            sample_size=sample_size, eval_budget=max_evaluations//10, n_jobs=-1)
@@ -75,7 +80,7 @@ if qp.isbinary(model) and not qp.isbinary(dataset):
 print(f'fitting model {model.__class__.__name__}')
 #train, val = dataset.training.split_stratified(0.6)
 #model.fit(train, val_split=val)
-model.fit(dataset.training, val_split=dataset.test)
+model.fit(dataset.training)
@@ -112,7 +117,7 @@ for error in qp.error.QUANTIFICATION_ERROR:
     score = error(true_prev, estim_prev)
     print(f'{error.__name__}={score:.5f}')

-sys.exit(0)
+#sys.exit(0)

 # Model selection and Evaluation according to the artificial sampling protocol
 # ----------------------------------------------------------------------------
@@ -123,7 +128,7 @@ model_selection = GridSearchQ(model,
                               error='mae',
                               refit=True,
                               verbose=True,
-                              timeout=4)
+                              timeout=60*60)

 model = model_selection.fit(dataset.training, val_split=0.3)
 #model = model_selection.fit(train, validation=val)
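
Note: the GridSearchQ timeout rises from 4 seconds (presumably a debugging value) to one hour (60*60 seconds). Each ave-pool fit generates thousands of sample averages (trials=5000) plus 200 correction samples, so a configuration can no longer complete within a few seconds.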