class AveragePoolQuantification(BinaryQuantifier):
    """Binary quantifier that regresses prevalence from average-pooled samples.

    During ``fit`` many samples are drawn from the training split; each sample
    is reduced to the mean of its instance vectors and paired with the
    prevalence of class index 1 in that sample. ``learner`` is fitted on those
    pairs. Two reference predictions (``correction_0`` and ``correction_1``)
    are then estimated on a held-out split and used in ``quantify`` to rescale
    the raw regressor output linearly before clipping it to ``[0, 1]``.

    Parameters
    ----------
    learner : object
        Regressor exposing ``fit(X, y)``, ``predict(X)``, ``get_params`` and
        ``set_params``.
    sample_size : int
        Number of instances in every sample that gets average-pooled.
    trials : int
        Target number of (pooled sample, prevalence) training pairs.
    n_components : int, default -1
        When positive, pooled vectors are projected with
        ``PCA(n_components)``; otherwise no PCA is applied.
    zscore : bool, default False
        When true, pooled vectors (after the optional PCA) are standardised
        with a ``StandardScaler``.
    """

    # Number of pooled validation samples averaged for each correction anchor.
    _N_CORRECTION_SAMPLES = 100

    def __init__(self, learner, sample_size, trials, n_components=-1, zscore=False):
        self.learner = learner
        self.sample_size = sample_size
        self.trials = trials

        self.do_zscore = zscore
        self.zscore = StandardScaler() if self.do_zscore else None

        self.do_pca = n_components > 0
        self.pca = PCA(n_components) if self.do_pca else None

    @staticmethod
    def _average_pool(instances):
        """Return the column-wise mean of ``instances`` as a ``(1, n_features)`` ndarray.

        ``mean(axis=0)`` yields a 1-D array for dense input and a 2-D matrix
        for matrix-like input; normalising both to a single-row ndarray keeps
        the downstream ``transform`` / ``predict`` calls valid in either case.
        """
        return np.asarray(instances.mean(axis=0)).reshape(1, -1)

    def _project(self, X):
        """Apply the already-fitted PCA and z-scoring steps (if enabled) to ``X``."""
        if self.do_pca:
            X = self.pca.transform(X)
        if self.do_zscore:
            X = self.zscore.transform(X)
        return X

    def _pooled_validation_batch(self, validation, prevalence):
        """Stack pooled vectors of repeated validation samples drawn at ``prevalence``."""
        return np.vstack([
            self._average_pool(
                validation.sampling(self.sample_size, prevalence, shuffle=False).instances
            )
            for _ in range(self._N_CORRECTION_SAMPLES)
        ])

    def fit(self, data: LabelledCollection):
        """Train the regressor on pooled samples and estimate the two corrections.

        Parameters
        ----------
        data : LabelledCollection
            Labelled data; 70% (stratified) is used to generate the training
            pairs and the remaining 30% to estimate the corrections.

        Returns
        -------
        AveragePoolQuantification
            ``self``, so that calls can be chained.
        """
        training, validation = data.split_stratified(train_prop=0.7)

        X, y = [], []

        nprevpoints = F.get_nprevpoints_approximation(self.trials, data.n_classes)
        for sample in tqdm(
                training.artificial_sampling_generator(self.sample_size, n_prevalences=nprevpoints, repeats=1),
                desc='generating averages'
        ):
            X.append(self._average_pool(sample.instances))
            y.append(sample.prevalence()[1])

        # Top up with randomly drawn prevalences until `trials` pairs exist.
        while len(X) < self.trials:
            # The prevalence vector is unpacked because `sampling` receives the
            # prevalences as separate positional values.
            # NOTE(review): assumes F.uniform_simplex_sampling returns a flat
            # vector with one entry per class -- confirm.
            sample = training.sampling(self.sample_size, *F.uniform_simplex_sampling(data.n_classes))
            X.append(self._average_pool(sample.instances))
            # Keep the target scalar (prevalence of class index 1), exactly as
            # in the loop above; appending the whole vector made `y` ragged.
            y.append(sample.prevalence()[1])

        X = np.vstack(X)
        y = np.asarray(y)

        if self.do_pca:
            X = self.pca.fit_transform(X)
            print(X.shape)

        if self.do_zscore:
            X = self.zscore.fit_transform(X)

        print('training regressor...')
        self.regressor = self.learner.fit(X, y)

        # Reference predictions used by `quantify` to rescale the raw output.
        # NOTE(review): if `sampling(size, p)` interprets `p` as the prevalence
        # of the first class, then `0.` yields samples made only of class
        # index 1 and the two anchors below are swapped -- confirm against the
        # LabelledCollection.sampling contract.
        print('getting corrections...')
        X0 = self._project(self._pooled_validation_batch(validation, 0.))
        X1 = self._project(self._pooled_validation_batch(validation, 1.))

        self.correction_0 = self.regressor.predict(X0).mean()
        self.correction_1 = self.regressor.predict(X1).mean()

        print('correction-0', self.correction_0)
        print('correction-1', self.correction_1)
        print('done')

        return self

    def quantify(self, instances):
        """Estimate the class prevalences of ``instances``.

        Parameters
        ----------
        instances : array-like of shape (n_instances, n_features)
            Instance vectors of the sample to quantify; must support
            ``mean(axis=0)``.

        Returns
        -------
        numpy.ndarray
            ``[1 - p, p]`` where ``p`` is the corrected and clipped regressor
            output.
        """
        ave = self._project(self._average_pool(instances))
        phat = self.regressor.predict(ave).item()

        span = self.correction_1 - self.correction_0
        if np.isclose(span, 0):
            # Both anchors coincide, so the linear rescaling is undefined;
            # fall back to the raw prediction instead of dividing by zero.
            phat = np.clip(phat, 0, 1)
        else:
            phat = np.clip((phat - self.correction_0) / span, 0, 1)
        return np.asarray([1 - phat, phat])

    def set_params(self, **parameters):
        """Forward ``parameters`` to the wrapped learner's ``set_params``."""
        self.learner.set_params(**parameters)

    def get_params(self, deep=True):
        """Return the wrapped learner's parameters (``learner.get_params``)."""
        return self.learner.get_params(deep=deep)
get_ranks_from_Gao_Sebastiani() diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index c974562..57c2467 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -547,8 +547,6 @@ class OneVsAll(AggregativeQuantifier): else: predictions = self.classify(X) return self.aggregate(predictions) - #prevalences = self.__parallel(self._delayed_binary_quantify, X) - #return F.normalize_prevalence(prevalences) def __parallel(self, func, *args, **kwargs): return np.asarray( diff --git a/test.py b/test.py index 74bb454..4eed0e7 100644 --- a/test.py +++ b/test.py @@ -1,10 +1,12 @@ from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV -from sklearn.svm import LinearSVC +from sklearn.svm import LinearSVC, LinearSVR import quapy as qp import quapy.functional as F import sys import numpy as np + +from NewMethods.methods import AveragePoolQuantification from classification.methods import PCALR from classification.neural import NeuralClassifierTrainer, CNNnet from quapy.model_selection import GridSearchQ @@ -29,7 +31,7 @@ if binary: else: dataset = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True) - dataset.training = dataset.training.sampling(sample_size, 0.2, 0.5, 0.3) + #dataset.training = dataset.training.sampling(sample_size, 0.2, 0.5, 0.3) print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}') @@ -51,14 +53,17 @@ print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.tes #model = qp.method.meta.QuaNet(learner, sample_size, device='cpu') #learner = GridSearchCV(LogisticRegression(max_iter=1000), param_grid=param_grid, n_jobs=-1, verbose=1) -learner = LogisticRegression(max_iter=1000) +#learner = LogisticRegression(max_iter=1000) # model = qp.method.aggregative.ClassifyAndCount(learner) -model = qp.method.meta.EPACC(learner, size=10, red_size=5, - param_grid={'C':[1,10,100]}, - optim='mae', 
param_mod_sel={'sample_size':100, 'n_prevpoints':21, 'n_repetitions':5}, - policy='ptr', n_jobs=1) +#model = qp.method.meta.EPACC(learner, size=10, red_size=5, +# param_grid={'C':[1,10,100]}, +# optim='mae', param_mod_sel={'sample_size':100, 'n_prevpoints':21, 'n_repetitions':5}, +# policy='ptr', n_jobs=1) +regressor = LinearSVR(max_iter=10000) +param_grid = {'C': np.logspace(-1,3,5)} +model = AveragePoolQuantification(regressor, sample_size, trials=5000, n_components=500, zscore=False) #model = qp.method.meta.EHDy(learner, param_grid=param_grid, optim='mae', # sample_size=sample_size, eval_budget=max_evaluations//10, n_jobs=-1) @@ -75,7 +80,7 @@ if qp.isbinary(model) and not qp.isbinary(dataset): print(f'fitting model {model.__class__.__name__}') #train, val = dataset.training.split_stratified(0.6) #model.fit(train, val_split=val) -model.fit(dataset.training, val_split=dataset.test) +model.fit(dataset.training) @@ -112,7 +117,7 @@ for error in qp.error.QUANTIFICATION_ERROR: score = error(true_prev, estim_prev) print(f'{error.__name__}={score:.5f}') -sys.exit(0) +#sys.exit(0) # Model selection and Evaluation according to the artificial sampling protocol # ---------------------------------------------------------------------------- @@ -123,7 +128,7 @@ model_selection = GridSearchQ(model, error='mae', refit=True, verbose=True, - timeout=4) + timeout=60*60) model = model_selection.fit(dataset.training, val_split=0.3) #model = model_selection.fit(train, validation=val)