
KDE working with kfcv (k-fold cross-validation)

Alejandro Moreo Fernandez 2023-07-24 16:28:21 +02:00
parent 0ce6106ee1
commit 96758834f3
6 changed files with 184 additions and 102 deletions

View File

@@ -27,7 +27,7 @@ if __name__ == '__main__':
'classifier__class_weight': ['balanced', None]
}
for method in ['PACC', 'SLD', 'DM', 'KDE', 'HDy', 'DIR']:
for method in ['KDE', 'PACC', 'SLD', 'DM', 'HDy-OvA', 'DIR']:
#if os.path.exists(result_path):
# print('Result already exists. Nothing to do')
@@ -43,7 +43,7 @@ if __name__ == '__main__':
dataset = 'T1B'
train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
print('init', dataset)
print(f'init {dataset} #instances: {len(train)}')
if method == 'KDE':
param_grid = {
'bandwidth': np.linspace(0.001, 0.2, 21),
@@ -51,6 +51,11 @@ if __name__ == '__main__':
'classifier__class_weight': ['balanced', None]
}
quantifier = KDEy(LogisticRegression(), target='max_likelihood')
elif method == 'KDE-debug':
param_grid = None
qp.environ['N_JOBS'] = 1
quantifier = KDEy(LogisticRegression(), target='max_likelihood', bandwidth=0.02)
#train = train.sampling(280, *[1./train.n_classes]*(train.n_classes-1))
elif method == 'DIR':
param_grid = hyper_LR
quantifier = DIRy(LogisticRegression())
@@ -62,7 +67,7 @@ if __name__ == '__main__':
quantifier = PACC(LogisticRegression())
elif method == 'HDy-OvA':
param_grid = {
'binary_quantifier__classifier__C': np.logspace(-4,4,9),
'binary_quantifier__classifier__C': np.logspace(-3,3,9),
'binary_quantifier__classifier__class_weight': ['balanced', None]
}
quantifier = OneVsAllAggregative(HDy(LogisticRegression()))
@@ -76,13 +81,17 @@ if __name__ == '__main__':
else:
raise NotImplementedError('unknown method', method)
modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)
if param_grid is not None:
modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)
modsel.fit(train)
print(f'best params {modsel.best_params_}')
pickle.dump(modsel.best_params_, open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
modsel.fit(train)
print(f'best params {modsel.best_params_}')
pickle.dump(modsel.best_params_, open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
quantifier = modsel.best_model()
quantifier = modsel.best_model()
else:
print('debug mode... skipping model selection')
quantifier.fit(train)
report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
means = report.mean()
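
A minimal end-to-end sketch of the KDE-debug path above (fixed bandwidth, no model selection, single-threaded), using only names that already appear in this script:

import quapy as qp
from sklearn.linear_model import LogisticRegression
from method_kdey import KDEy

qp.environ['N_JOBS'] = 1  # single-threaded, as in the debug branch
train, val_gen, test_gen = qp.datasets.fetch_lequa2022('T1B')
quantifier = KDEy(LogisticRegression(), target='max_likelihood', bandwidth=0.02)
quantifier.fit(train)
report = qp.evaluation.evaluation_report(
    quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
print(report.mean())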

View File

@@ -6,12 +6,13 @@ import sys
import pandas as pd
import quapy as qp
from quapy.method.aggregative import EMQ, DistributionMatching, PACC, HDy, OneVsAllAggregative
from quapy.method.aggregative import EMQ, DistributionMatching, PACC, ACC, CC, PCC, HDy, OneVsAllAggregative
from method_kdey import KDEy
from method_dirichlety import DIRy
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
SEED=1
if __name__ == '__main__':
@@ -29,7 +30,7 @@ if __name__ == '__main__':
'classifier__class_weight': ['balanced', None]
}
for method in ['PACC', 'SLD', 'DM', 'KDE', 'HDy', 'DIR']:
for method in ['KDE-nomonte', 'KDE-monte2', 'SLD', 'KDE-kfcv']:# , 'DIR', 'DM', 'HDy-OvA', 'CC', 'ACC', 'PCC']:
#if os.path.exists(result_path):
# print('Result already exists. Nothing to do')
@@ -49,69 +50,100 @@ if __name__ == '__main__':
for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
print('init', dataset)
is_semeval = dataset.startswith('semeval')
if not is_semeval or not semeval_trained:
if method == 'KDE':
param_grid = {
'bandwidth': np.linspace(0.001, 0.2, 21),
'classifier__C': np.logspace(-4,4,9),
'classifier__class_weight': ['balanced', None]
}
quantifier = KDEy(LogisticRegression(), target='max_likelihood')
elif method == 'DIR':
param_grid = hyper_LR
quantifier = DIRy(LogisticRegression())
elif method == 'SLD':
param_grid = hyper_LR
quantifier = EMQ(LogisticRegression())
elif method == 'PACC':
param_grid = hyper_LR
quantifier = PACC(LogisticRegression())
elif method == 'HDy-OvA':
param_grid = {
'binary_quantifier__classifier__C': np.logspace(-4,4,9),
'binary_quantifier__classifier__class_weight': ['balanced', None]
}
quantifier = OneVsAllAggregative(HDy(LogisticRegression()))
elif method == 'DM':
param_grid = {
'nbins': [5,10,15],
'classifier__C': np.logspace(-4,4,9),
'classifier__class_weight': ['balanced', None]
}
quantifier = DistributionMatching(LogisticRegression())
else:
raise NotImplementedError('unknown method', method)
# model selection
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)
protocol = UPP(data.test, repeats=n_bags_val)
modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)
modsel.fit(data.training)
print(f'best params {modsel.best_params_}')
pickle.dump(modsel.best_params_, open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
quantifier = modsel.best_model()
if is_semeval:
semeval_trained = True
else:
print(f'model selection for {dataset} already done; skipping')
with qp.util.temp_seed(SEED):
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
quantifier.fit(data.training)
protocol = UPP(data.test, repeats=n_bags_test)
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
report.to_csv(result_path+'.dataframe')
means = report.mean()
csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
csv.flush()
is_semeval = dataset.startswith('semeval')
if not is_semeval or not semeval_trained:
if method == 'KDE':
param_grid = {
'bandwidth': np.linspace(0.001, 0.2, 21),
'classifier__C': np.logspace(-4,4,9),
'classifier__class_weight': ['balanced', None]
}
quantifier = KDEy(LogisticRegression(), target='max_likelihood')
elif method == 'KDE-kfcv':
param_grid = {
'bandwidth': np.linspace(0.001, 0.2, 21),
'classifier__C': np.logspace(-4,4,9),
'classifier__class_weight': ['balanced', None]
}
quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10)
elif method in ['KDE-monte2']:
param_grid = {
'bandwidth': np.linspace(0.001, 0.2, 21),
}
quantifier = KDEy(LogisticRegression(), target='min_divergence')
elif method in ['KDE-nomonte']:
param_grid = {
'bandwidth': np.linspace(0.001, 0.2, 21),
}
quantifier = KDEy(LogisticRegression(), target='max_likelihood')
elif method == 'DIR':
param_grid = hyper_LR
quantifier = DIRy(LogisticRegression())
elif method == 'SLD':
param_grid = hyper_LR
quantifier = EMQ(LogisticRegression())
elif method == 'PACC':
param_grid = hyper_LR
quantifier = PACC(LogisticRegression())
elif method == 'PACC-kfcv':
param_grid = hyper_LR
quantifier = PACC(LogisticRegression(), val_split=10)
elif method == 'PCC':
param_grid = hyper_LR
quantifier = PCC(LogisticRegression())
elif method == 'ACC':
param_grid = hyper_LR
quantifier = ACC(LogisticRegression())
elif method == 'CC':
param_grid = hyper_LR
quantifier = CC(LogisticRegression())
elif method == 'HDy-OvA':
param_grid = {
'binary_quantifier__classifier__C': np.logspace(-4,4,9),
'binary_quantifier__classifier__class_weight': ['balanced', None]
}
quantifier = OneVsAllAggregative(HDy(LogisticRegression()))
elif method == 'DM':
param_grid = {
'nbins': [5,10,15],
'classifier__C': np.logspace(-4,4,9),
'classifier__class_weight': ['balanced', None]
}
quantifier = DistributionMatching(LogisticRegression())
else:
raise NotImplementedError('unknown method', method)
# model selection
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)
protocol = UPP(data.test, repeats=n_bags_val)
modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)
modsel.fit(data.training)
print(f'best params {modsel.best_params_}')
pickle.dump(modsel.best_params_, open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
quantifier = modsel.best_model()
if is_semeval:
semeval_trained = True
else:
print(f'model selection for {dataset} already done; skipping')
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
quantifier.fit(data.training)
protocol = UPP(data.test, repeats=n_bags_test)
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
report.to_csv(result_path+'.dataframe')
means = report.mean()
csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
csv.flush()
df = pd.read_csv(result_path+'.csv', sep='\t')

View File

@@ -30,7 +30,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
TARGET = ['min_divergence', 'max_likelihood']
def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='HD',
bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None):
bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None, random_state=0):
assert bandwidth in KDEy.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
@@ -42,6 +42,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
self.engine = engine
self.target = target
self.n_jobs = n_jobs
self.random_state = random_state
def search_bandwidth_maxlikelihood(self, posteriors, labels):
grid = {'bandwidth': np.linspace(0.001, 0.2, 100)}
@@ -84,14 +85,20 @@ class KDEy(AggregativeProbabilisticQuantifier):
kde = scipy.stats.gaussian_kde(posteriors)
kde.set_bandwidth(self.bandwidth)
elif self.engine == 'sklearn':
#print('fitting kde')
kde = KernelDensity(bandwidth=self.bandwidth).fit(posteriors)
#print('[fitting done]')
return kde
def pdf(self, kde, posteriors):
if self.engine == 'scipy':
return kde(posteriors[:, :-1].T)
elif self.engine == 'sklearn':
return np.exp(kde.score_samples(posteriors))
#print('pdf...')
densities = np.exp(kde.score_samples(posteriors))
#print('[pdf done]')
return densities
#return np.exp(kde.score_samples(posteriors))
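
Worth noting: the two engines follow different conventions, which is why the sklearn branch exponentiates. scipy's gaussian_kde takes data of shape (n_dims, n_points) and returns densities directly; sklearn's KernelDensity takes (n_points, n_dims) and its score_samples returns log-densities. A minimal sketch on toy data:

import numpy as np
import scipy.stats
from sklearn.neighbors import KernelDensity

X = np.random.RandomState(0).rand(100, 2)      # 100 points in 2 dimensions

kde_scipy = scipy.stats.gaussian_kde(X.T)      # scipy expects (n_dims, n_points)
dens_scipy = kde_scipy(X.T)                    # densities, directly

kde_skl = KernelDensity(bandwidth=0.1).fit(X)  # sklearn expects (n_points, n_dims)
dens_skl = np.exp(kde_skl.score_samples(X))    # log-densities, hence np.exp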
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
"""
@@ -118,13 +125,13 @@ class KDEy(AggregativeProbabilisticQuantifier):
return self
def val_pdf(self, prev):
#def val_pdf(self, prev):
"""
Returns a function that computes the mixture model with the given prev as mixture factor
:param prev: a prevalence vector, ndarray
:return: a function implementing the validation distribution with fixed mixture factor
"""
return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))
# return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))
def aggregate(self, posteriors: np.ndarray):
if self.target == 'min_divergence':
@@ -134,14 +141,9 @@ class KDEy(AggregativeProbabilisticQuantifier):
else:
raise ValueError('unknown target')
def _target_divergence(self, posteriors):
"""
Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
(the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
:param instances: instances in the sample
:return: a vector of class prevalence estimates
"""
def _target_divergence_depr(self, posteriors):
# this variant is, I think, ill-formed, since it evaluates the likelihood at the test points, which are
# overconfident under the test KDE.
test_density = self.get_kde(posteriors)
# val_test_posteriors = np.concatenate([self.val_posteriors, posteriors])
test_likelihood = self.pdf(test_density, posteriors)
@@ -164,6 +166,31 @@ class KDEy(AggregativeProbabilisticQuantifier):
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
def _target_divergence(self, posteriors, montecarlo_samples=5000):
# in this variant we evaluate the divergence using a Monte Carlo approach
n_classes = len(self.val_densities)
samples = qp.functional.uniform_prevalence_sampling(n_classes, size=montecarlo_samples)
test_kde = self.get_kde(posteriors)
test_likelihood = self.pdf(test_kde, samples)
divergence = _get_divergence(self.divergence)
sample_densities = [self.pdf(kde_i, samples) for kde_i in self.val_densities]
def match(prev):
val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, sample_densities))
return divergence(val_likelihood, test_likelihood)
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
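
Here _get_divergence(self.divergence) resolves the 'HD' string to a Hellinger-distance function; a plausible discrete version over the Monte Carlo evaluations (an illustrative sketch, not necessarily QuaPy's exact implementation) would be:

import numpy as np

def hellinger_distance(p, q):
    # treat the two non-negative density evaluations as discrete
    # distributions over the Monte Carlo sample points
    p = p / p.sum()
    q = q / q.sum()
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2) / 2)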
def _target_likelihood(self, posteriors, eps=0.000001):
"""
Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
@@ -172,13 +199,20 @@ class KDEy(AggregativeProbabilisticQuantifier):
:param posteriors: posterior probabilities of the instances in the sample
:return: a vector of class prevalence estimates
"""
np.random.seed(self.random_state)  # seed the global RNG; a bare np.random.RandomState(...) call creates an unused generator and has no effect
n_classes = len(self.val_densities)
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.val_densities]
#return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))
def neg_loglikelihood(prev):
val_pdf = self.val_pdf(prev)
test_likelihood = val_pdf(posteriors)
test_loglikelihood = np.log(test_likelihood + eps)
return -np.sum(test_loglikelihood)
#print('-neg_likelihood')
#val_pdf = self.val_pdf(prev)
#test_likelihood = val_pdf(posteriors)
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + eps)
neg_log_likelihood = -np.sum(test_loglikelihood)
#print('-neg_likelihood [done!]')
return neg_log_likelihood
#return -np.prod(test_likelihood)
# the initial point is set as the uniform distribution
@@ -187,5 +221,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
#print('searching for alpha')
r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
#print('[optimization ended]')
return r.x
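
The same constrained-optimization pattern in isolation: a self-contained toy example that recovers the weights of a 30/70 mixture of two known densities by maximum likelihood (the analytic densities play the role of test_densities above):

import numpy as np
from scipy import optimize
from scipy.stats import norm

# sample from a 30/70 mixture of two unit-variance Gaussians
x = np.concatenate([norm.rvs(loc=-1, size=300, random_state=0),
                    norm.rvs(loc=+1, size=700, random_state=1)])
dens = [norm(loc=-1).pdf(x), norm(loc=+1).pdf(x)]  # class-conditional densities at x

def neg_loglikelihood(prev):
    mixture = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, dens))
    return -np.sum(np.log(mixture + 1e-6))

uniform = np.full(2, 1/2)
bounds = tuple((0, 1) for _ in range(2))                      # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda p: 1 - sum(p)})   # summing up to 1
r = optimize.minimize(neg_loglikelihood, x0=uniform, method='SLSQP',
                      bounds=bounds, constraints=constraints)
print(r.x)  # approximately [0.3, 0.7]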

View File

@@ -2,18 +2,18 @@ import sys
from pathlib import Path
import pandas as pd
#result_dir = 'results_tweet_1000'
result_dir = 'results_lequa'
result_dir = 'results_tweet_1000'
#result_dir = 'results_lequa'
dfs = []
pathlist = Path(result_dir).rglob('*.csv')
for path in pathlist:
path_in_str = str(path)
print(path_in_str)
try:
df = pd.read_csv(path_in_str, sep='\t')
df = df[df.iloc[:, 0] != df.columns[0]]
if not df.empty:
dfs.append(df)
except Exception:
@@ -21,7 +21,7 @@ for path in pathlist:
df = pd.concat(dfs)
for err in ['MAE', 'MRAE']:
for err in ['MAE', 'MRAE', 'KLD']:
print('-'*100)
print(err)
print('-'*100)
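
A sketch of the per-metric summary each of these blocks could print, assuming (hypothetically) that the concatenated dataframe carries columns named Method, Dataset, and one column per error metric:

pivot = df.pivot_table(index='Dataset', columns='Method', values=err)
print(pivot.round(5))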

View File

@@ -4,23 +4,28 @@ and the other is a KDE on test), from which the divergence will then be computed (obj
generate only one distribution (the training mixture model) and take the likelihood of the test points as the objective
to maximize.
- keep the best hyperparameters so they can be inspected
- export the results dataframes so as to run statistical tests
- take a look at the hyperparameters
- make some plots
- study the case in which the target is to minimize a divergence. Possibilities:
- evaluate the test points only
- evaluate an APP over the simplex?
- evaluate a UPP over the simplex? (= Monte Carlo)
- which divergences? HD, Topsoe, L1?
1) clarify: only test?
2) implement the automatic mode
- internal optimization for likelihood [none seems to work well]
- over everything (e.g., the whole training set)?
- independently for each labelled set? (e.g., positives, negatives, neutrals, and test)
- optimization as a GridSearchQ parameter
3) clarify: Topsoe?
4) another kind of model selection?
5) increase the number of bags
6) optimize the C parameter? optimize the kernel? optimize the distance?
6) optimize the kernel? optimize the distance?
7) sklearn's KDE or statsmodels' multivariate KDE? also check what this is (it seems to yield P(Y|X), so maybe it could
remove the need for the classifier? see the sketch after this list):
https://www.statsmodels.org/dev/_modules/statsmodels/nonparametric/kernel_density.html#KDEMultivariateConditional
8) drop the last dimension in sklearn too?
9) optimize for RAE instead of AE?
8) drop the last dimension in sklearn too? I don't see why
9) optimize for RAE instead of AE? It does not work well...
10) Define a classifier that returns, for each class, a posterior computed as the likelihood under the class-conditional KDE divided
by the likelihood under all the classes (as Juanjo proposes) and plug it into EMD. Do the opposite: re-calibrate with
EMD and plug it into KDEy
11) KDEx?
12) Dirichlet (the DIR method) needs fixing, and its results should be reported...
13) Statistical tests.
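
Regarding point 7, a minimal sketch of statsmodels' conditional KDE on toy data (whether it could really replace the classifier is the open question above):

import numpy as np
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional

rng = np.random.RandomState(0)
X = rng.normal(size=500)                              # one continuous covariate
y = (X + rng.normal(scale=0.5, size=500) > 0) * 1.0   # a binary label

# 'o' = ordered discrete (the label); 'c' = continuous (the covariate)
cond_kde = KDEMultivariateConditional(endog=[y], exog=[X],
                                      dep_type='o', indep_type='c',
                                      bw='normal_reference')
# density of y given X; normalized over the label values this resembles P(Y|X)
print(cond_kde.pdf(endog_predict=[1.0], exog_predict=[0.3]))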

View File

@@ -123,7 +123,7 @@ class LabelledCollection:
return self.uniform_sampling_index(size, random_state=random_state)
if len(prevs) == self.n_classes - 1:
prevs = prevs + (1 - sum(prevs),)
assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
assert len(prevs) == self.n_classes, f'unexpected number of prevalences (found {len(prevs)}, expected {self.n_classes})'
assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
# Decide how many instances should be taken for each class in order to satisfy the requested prevalence
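
For instance, the completion step above lets callers specify only the first n_classes-1 prevalence values:

# for a 3-class collection:
prevs = (0.25, 0.5)                 # prevalences for 2 of the 3 classes
prevs = prevs + (1 - sum(prevs),)   # completed to (0.25, 0.5, 0.25)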