forked from moreo/QuaPy
cleaning
This commit is contained in: parent 168c109794 · commit 1aafd10e25
@@ -1,174 +0,0 @@
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import quapy as qp
from typing import Union

from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier
from quapy.method.aggregative import PACC, EMQ, HDy
import quapy.functional as F
from tqdm import tqdm
from scipy.sparse import issparse, csr_matrix
import scipy

class PACCSLD(PACC):
    """
    This method combines the EMQ-improved posterior probabilities with PACC.
    Note: the posterior probabilities are re-calibrated with EMQ only during prediction, and not also during fit,
    since, for PACC, the validation split is known to have the same prevalence as the training set (the split is
    stratified), and thus the posterior probabilities should not be re-calibrated for a different prior (doing so
    actually happens to degrade performance).
    """

    def fit(self, data: qp.data.LabelledCollection, fit_learner=True, val_split: Union[float, int, qp.data.LabelledCollection] = 0.4):
        self.train_prevalence = F.prevalence_from_labels(data.labels, data.n_classes)
        return super(PACCSLD, self).fit(data, fit_learner, val_split)

    def aggregate(self, classif_posteriors):
        priors, posteriors = EMQ.EM(self.train_prevalence, classif_posteriors, epsilon=1e-4)
        return super(PACCSLD, self).aggregate(posteriors)

class HDySLD(HDy):
    """
    This method combines the EMQ-improved posterior probabilities with HDy.
    Note: [same as PACCSLD]
    """

    def fit(self, data: qp.data.LabelledCollection, fit_learner=True,
            val_split: Union[float, int, qp.data.LabelledCollection] = 0.4):
        self.train_prevalence = F.prevalence_from_labels(data.labels, data.n_classes)
        return super(HDySLD, self).fit(data, fit_learner, val_split)

    def aggregate(self, classif_posteriors):
        priors, posteriors = EMQ.EM(self.train_prevalence, classif_posteriors, epsilon=1e-4)
        return super(HDySLD, self).aggregate(posteriors)

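# Hedged usage sketch, not part of the original module: both SLD variants are fitted
# as usual, and the EMQ re-calibration happens transparently inside aggregate(). The
# dataset name and attribute access below are assumptions based on the public QuaPy API.
def _sld_usage_sketch():
    from sklearn.linear_model import LogisticRegression
    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)  # any binary dataset works
    model = HDySLD(LogisticRegression(max_iter=1000))
    model.fit(dataset.training)                    # also records the training prevalence
    return model.quantify(dataset.test.instances)  # EMQ-shifted posteriors, aggregated by HDy
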
class AveragePoolQuantification(BinaryQuantifier):
    def __init__(self, learner, sample_size, trials, n_components=-1, zscore=False):
        self.learner = learner
        self.sample_size = sample_size
        self.trials = trials

        self.do_zscore = zscore
        self.zscore = StandardScaler() if self.do_zscore else None

        self.do_pca = n_components > 0
        self.pca = PCA(n_components) if self.do_pca else None

    def fit(self, data: LabelledCollection):
        training, validation = data.split_stratified(train_prop=0.7)

        X, y = [], []

        nprevpoints = F.get_nprevpoints_approximation(self.trials, data.n_classes)
        for sample in tqdm(
                training.artificial_sampling_generator(self.sample_size, n_prevalences=nprevpoints, repeats=1),
                desc='generating averages'
        ):
            X.append(sample.instances.mean(axis=0))
            y.append(sample.prevalence()[1])
        while len(X) < self.trials:
            sample = training.sampling(self.sample_size, F.uniform_simplex_sampling(data.n_classes))
            X.append(sample.instances.mean(axis=0))
            y.append(sample.prevalence()[1])  # positive-class prevalence, consistent with the loop above
        X = np.asarray(np.vstack(X))
        y = np.asarray(y)

        if self.do_pca:
            X = self.pca.fit_transform(X)
            print(X.shape)

        if self.do_zscore:
            X = self.zscore.fit_transform(X)

        print('training regressor...')
        self.regressor = self.learner.fit(X, y)

        # corrections at the extreme prevalences (0 and 1):
        print('getting corrections...')
        X0 = np.asarray(np.vstack(
            [validation.sampling(self.sample_size, 0., shuffle=False).instances.mean(axis=0) for _ in range(100)]))
        X1 = np.asarray(np.vstack(
            [validation.sampling(self.sample_size, 1., shuffle=False).instances.mean(axis=0) for _ in range(100)]))

        if self.do_pca:
            X0 = self.pca.transform(X0)
            X1 = self.pca.transform(X1)

        if self.do_zscore:
            X0 = self.zscore.transform(X0)
            X1 = self.zscore.transform(X1)

        self.correction_0 = self.regressor.predict(X0).mean()
        self.correction_1 = self.regressor.predict(X1).mean()

        print('correction-0', self.correction_0)
        print('correction-1', self.correction_1)
        print('done')

    def quantify(self, instances):
        ave = np.asarray(instances.mean(axis=0))

        if self.do_pca:
            ave = self.pca.transform(ave)
        if self.do_zscore:
            ave = self.zscore.transform(ave)
        phat = self.regressor.predict(ave).item()
        phat = np.clip((phat - self.correction_0) / (self.correction_1 - self.correction_0), 0, 1)
        return np.asarray([1 - phat, phat])

    def set_params(self, **parameters):
        self.learner.set_params(**parameters)

    def get_params(self, deep=True):
        return self.learner.get_params(deep=deep)

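# Hedged numeric sketch, not part of the original module: the last step of
# AveragePoolQuantification.quantify rescales the raw regressor output using the
# regressor's mean response on artificially pure (all-negative / all-positive)
# validation samples, then clips to [0, 1]. The numbers below are made up.
def _avepool_correction_sketch():
    raw, c0, c1 = 0.42, 0.07, 0.91                # hypothetical regressor outputs
    phat = np.clip((raw - c0) / (c1 - c0), 0, 1)  # (0.42-0.07)/(0.91-0.07) ~= 0.417
    return np.asarray([1 - phat, phat])           # [Pr(class 0), Pr(class 1)]
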
class WinnowOrthogonal(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, X, y):
        self.classes_ = np.asarray(sorted(np.unique(y)))
        w1 = np.asarray(X[y == 0].mean(axis=0)).flatten()
        w2 = np.asarray(X[y == 1].mean(axis=0)).flatten()
        diff = w2 - w1
        orth = np.ones_like(diff)
        orth[0] = -diff[1:].sum() / diff[0]  # choose the first coordinate so that orth . diff == 0
        orth /= np.linalg.norm(orth)
        self.w = orth
        self.b = w1.dot(orth)
        return self

    def decision_function(self, X):
        if issparse(X):
            Z = X.dot(csr_matrix(self.w).T).toarray().flatten()
            return Z - self.b
        else:
            return np.matmul(X, self.w) - self.b

    def predict(self, X):
        return 1 * (self.decision_function(X) > 0)

    def split(self, X, y):
        s = self.predict(X)
        X0a = X[np.logical_and(y == 0, s == 0)]
        X0b = X[np.logical_and(y == 0, s == 1)]
        X1a = X[np.logical_and(y == 1, s == 0)]
        X1b = X[np.logical_and(y == 1, s == 1)]
        y0a = np.zeros(X0a.shape[0], dtype=int)
        y0b = np.zeros(X0b.shape[0], dtype=int)
        y1a = np.ones(X1a.shape[0], dtype=int)
        y1b = np.ones(X1b.shape[0], dtype=int)
        return X0a, X0b, X1a, X1b, y0a, y0b, y1a, y1b

    def get_params(self):
        return {}

    def set_params(self, **params):
        pass

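# Hedged sketch, not part of the original module: WinnowOrthogonal places its
# separating hyperplane orthogonal to the segment joining the two class centroids,
# so the learned normal has (up to float error) zero dot product with that segment.
def _winnow_orthogonal_sketch():
    rng = np.random.RandomState(0)
    X = rng.rand(100, 5)
    y = (X[:, 0] > 0.5).astype(int)
    clf = WinnowOrthogonal().fit(X, y)
    diff = X[y == 1].mean(axis=0) - X[y == 0].mean(axis=0)
    print('w . (mu1 - mu0) =', float(np.dot(clf.w, diff)))  # ~0 by construction
    print('first predictions:', clf.predict(X[:5]))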
@@ -1,48 +0,0 @@
from sklearn.linear_model import LogisticRegression
import quapy as qp
from classification.methods import PCALR
from method.meta import QuaNet
from quapy.method.aggregative import *
from NewMethods.methods import *
from experiments import run, SAMPLE_SIZE
import numpy as np
import itertools
from joblib import Parallel, delayed
import settings
import argparse
import torch

parser = argparse.ArgumentParser(description='Run experiments for Twitter Sentiment Quantification')
parser.add_argument('results', metavar='RESULT_PATH', type=str, help='path to the directory where to store the results')
# parser.add_argument('svmperfpath', metavar='SVMPERF_PATH', type=str, help='path to the directory with svmperf')
args = parser.parse_args()

def quantification_models():
    def newLR():
        return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
    __C_range = np.logspace(-4, 5, 10)
    lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
    svmperf_params = {'C': __C_range}
    # yield 'paccsld', PACCSLD(newLR()), lr_params
    yield 'hdysld', OneVsAll(HDySLD(newLR())), lr_params  # <-- promising!

    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # print(f'Running QuaNet in {device}')
    # yield 'quanet', QuaNet(PCALR(**newLR().get_params()), SAMPLE_SIZE, device=device), lr_params

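# Hedged note, not part of the original script: the hyperparameter grid in
# quantification_models above spans ten logarithmically spaced values of C;
# np.logspace(-4, 5, 10) yields [1e-04, 1e-03, ..., 1e+04, 1e+05], one per power of ten.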
if __name__ == '__main__':

    print(f'Result folder: {args.results}')
    np.random.seed(0)

    optim_losses = ['mae']
    datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN
    models = quantification_models()

    results = Parallel(n_jobs=settings.N_JOBS)(
        delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
    )
@@ -1,148 +0,0 @@
import quapy as qp
import numpy as np
from os import makedirs
import sys, os
import pickle
from experiments import result_path
from gen_tables import save_table, experiment_errors
from tabular import Table
import argparse

tables_path = './tables'
MAXTONE = 50  # sets the intensity of the maximum color reached by the worst (red) and best (green) results

makedirs(tables_path, exist_ok=True)

sample_size = 100
qp.environ['SAMPLE_SIZE'] = sample_size

nice = {
    'mae': 'AE',
    'mrae': 'RAE',
    'ae': 'AE',
    'rae': 'RAE',
    'svmkld': 'SVM(KLD)',
    'svmnkld': 'SVM(NKLD)',
    'svmq': 'SVM(Q)',
    'svmae': 'SVM(AE)',
    'svmnae': 'SVM(NAE)',
    'svmmae': 'SVM(AE)',
    'svmmrae': 'SVM(RAE)',
    'quanet': 'QuaNet',
    'hdy': 'HDy',
    'hdysld': 'HDy-SLD',
    'dys': 'DyS',
    'svmperf': '',
    'sanders': 'Sanders',
    'semeval13': 'SemEval13',
    'semeval14': 'SemEval14',
    'semeval15': 'SemEval15',
    'semeval16': 'SemEval16',
    'Average': 'Average'
}

def nicerm(key):
    return r'\mathrm{' + nice[key] + '}'

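# Hedged example, not part of the original script: nicerm wraps a method's display
# name for LaTeX math mode, e.g. nicerm('mae') == r'\mathrm{AE}'.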
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate tables for Twitter Sentiment Quantification')
    parser.add_argument('results', metavar='RESULT_PATH', type=str,
                        help='path to the directory containing the results of the methods tested in Gao & Sebastiani')
    parser.add_argument('newresults', metavar='NEW_RESULT_PATH', type=str,
                        help='path to the directory containing the results for the experimental methods')
    args = parser.parse_args()

    datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
    evaluation_measures = [qp.error.ae, qp.error.rae]
    gao_seb_methods = ['cc', 'acc', 'pcc', 'pacc', 'sld', 'svmq', 'svmkld', 'svmnkld']
    new_methods = ['hdy']  # methods added to the Gao & Sebastiani methods
    experimental_methods = ['hdysld']  # experimental

    for i, eval_func in enumerate(evaluation_measures):

        # Tables of evaluation scores for AE and RAE (two tables)
        # ------------------------------------------------------

        eval_name = eval_func.__name__

        added_methods = ['svmm' + eval_name] + new_methods
        methods = gao_seb_methods + added_methods + experimental_methods
        nold_methods = len(gao_seb_methods)
        nnew_methods = len(added_methods)
        nexp_methods = len(experimental_methods)

        # fill the data table
        table = Table(benchmarks=datasets, methods=methods)
        for dataset in datasets:
            for method in methods:
                if method in experimental_methods:
                    path = args.newresults
                else:
                    path = args.results
                table.add(dataset, method, experiment_errors(path, dataset, method, eval_name))

        # write the latex table
tabular = """
|
|
||||||
\\begin{tabularx}{\\textwidth}{|c||""" + ('Y|'*nold_methods) + '|' + ('Y|'*nnew_methods) + '|' + ('Y|'*nexp_methods) + """} \hline
|
|
||||||
& \multicolumn{"""+str(nold_methods)+"""}{c||}{Methods tested in~\cite{Gao:2016uq}} &
|
|
||||||
\multicolumn{"""+str(nnew_methods)+"""}{c|}{} &
|
|
||||||
\multicolumn{"""+str(nexp_methods)+"""}{c|}{}\\\\ \hline
|
|
||||||
"""
|
|
||||||
        rowreplace = {dataset: nice.get(dataset, dataset.upper()) for dataset in datasets}
        colreplace = {method: '\\side{' + nice.get(method, method.upper()) + '$^{' + nicerm(eval_name) + '}$} ' for method in methods}

        tabular += table.latexTabular(benchmark_replace=rowreplace, method_replace=colreplace)
        tabular += '\n\\end{tabularx}'

        save_table(f'./tables/tab_results_{eval_name}.new.tex', tabular)

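        # Hedged illustration, not part of the original script: with 8 Gao & Sebastiani
        # methods, 2 added methods and 1 experimental method, the column spec built above
        # expands to |c||Y|Y|Y|Y|Y|Y|Y|Y||Y|Y||Y| (one Y column per method).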
        # Tables of ranks for AE and RAE (two tables)
        # ------------------------------------------------------
        # fill the data table
        ranktable = Table(benchmarks=datasets, methods=methods, missing='--')
        for dataset in datasets:
            for method in methods:
                ranktable.add(dataset, method, values=table.get(dataset, method, 'rank'))

        # write the latex table
tabular = """
|
|
||||||
\\begin{tabularx}{\\textwidth}{|c||""" + ('Y|'*nold_methods) + '|' + ('Y|'*nnew_methods) + '|' + ('Y|'*nexp_methods) + """} \hline
|
|
||||||
& \multicolumn{"""+str(nold_methods)+"""}{c||}{Methods tested in~\cite{Gao:2016uq}} &
|
|
||||||
\multicolumn{"""+str(nnew_methods)+"""}{c|}{} &
|
|
||||||
\multicolumn{"""+str(nexp_methods)+"""}{c|}{}\\\\ \hline
|
|
||||||
"""
|
|
||||||
        for method in methods:
            tabular += ' & \\side{' + nice.get(method, method.upper()) + '$^{' + nicerm(eval_name) + '}$} '
        tabular += '\\\\\\hline\n'

        for dataset in datasets:
            tabular += nice.get(dataset, dataset.upper()) + ' '
            for method in methods:
                newrank = ranktable.get(dataset, method)
                if newrank != '--':
                    newrank = f'{int(newrank)}'
                color = ranktable.get_color(dataset, method)
                if color == '--':
                    color = ''
                tabular += ' & ' + f'{newrank}' + color
            tabular += '\\\\\\hline\n'
        tabular += '\\hline\n'

        tabular += 'Average '
        for method in methods:
            newrank = ranktable.get_average(method)
            if newrank != '--':
                newrank = f'{newrank:.1f}'
            color = ranktable.get_average(method, 'color')
            if color == '--':
                color = ''
            tabular += ' & ' + f'{newrank}' + color
        tabular += '\\\\\\hline\n'
        tabular += '\\end{tabularx}'

        save_table(f'./tables/tab_rank_{eval_name}.new.tex', tabular)

    print("[Done]")
@@ -1,4 +0,0 @@
import multiprocessing

N_JOBS = -2  # joblib convention: all CPUs but one; alternatively, multiprocessing.cpu_count()
SAMPLE_SIZE = 100