forked from moreo/QuaPy
understanding montecarlo sampling
This commit is contained in:
parent 56dbe744df
commit 32d6aa58f6
@@ -16,68 +16,68 @@ if __name__ == '__main__':

     qp.environ['N_JOBS'] = -1
     n_bags_val = 250
     n_bags_test = 1000
-    optim = 'mae'
+    for optim in ['mae', 'mrae']:

         result_dir = f'results/binary/{optim}'

         os.makedirs(result_dir, exist_ok=True)

         for method in BIN_METHODS:

             print('Init method', method)

             global_result_path = f'{result_dir}/{method}'

             if not os.path.exists(global_result_path + '.csv'):
                 with open(global_result_path + '.csv', 'wt') as csv:
                     csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')

             with open(global_result_path + '.csv', 'at') as csv:

                 for dataset in qp.datasets.UCI_DATASETS:
                     if dataset in ['acute.a', 'acute.b', 'iris.1']: continue  # , 'pageblocks.5', 'spambase', 'wdbc']: continue

                     print('init', dataset)

                     local_result_path = global_result_path + '_' + dataset
                     if os.path.exists(local_result_path + '.dataframe'):
                         print(f'result file {local_result_path}.dataframe already exist; skipping')
                         continue

                     with qp.util.temp_seed(SEED):

                         param_grid, quantifier = new_method(method, max_iter=3000)

                         data = qp.datasets.fetch_UCIDataset(dataset)

                         # model selection
                         train, test = data.train_test
                         train, val = train.split_stratified()

                         protocol = UPP(val, repeats=n_bags_val)
                         modsel = GridSearchQ(
                             quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error=optim
                         )

                         try:
                             modsel.fit(train)

                             print(f'best params {modsel.best_params_}')
                             print(f'best score {modsel.best_score_}')
                             pickle.dump(
                                 (modsel.best_params_, modsel.best_score_,),
                                 open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

                             quantifier = modsel.best_model()
                         except:
                             print('something went wrong... reporting CC')
                             quantifier = qp.method.aggregative.CC(LR()).fit(train)

                         protocol = UPP(test, repeats=n_bags_test)
                         report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
                                                                  verbose=True)
                         report.to_csv(f'{local_result_path}.dataframe')
                         means = report.mean()
                         csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                         csv.flush()

             show_results(global_result_path)
@@ -6,8 +6,8 @@ from distribution_matching.method_dirichlety import DIRy
 from sklearn.linear_model import LogisticRegression


-METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML']
-BIN_METHODS = ['ACC', 'PACC', 'HDy', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML']
+METHODS = ['KDEy-DMjs', 'ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD']
+BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]


 hyper_LR = {

@@ -57,10 +57,30 @@ def new_method(method, **lr_kwargs):
         param_grid = {**method_params, **hyper_LR}
         quantifier = DistributionMatching(lr)

-    elif method in ['KDE-DMkld']:
+    # experimental
+    elif method in ['KDEy-DMkld']:
         method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
         param_grid = {**method_params, **hyper_LR}
         quantifier = KDEy(lr, target='min_divergence', divergence='KLD', montecarlo_trials=5000, val_split=10)
+    elif method in ['KDEy-DMhd']:
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
+    elif method in ['KDEy-DMhd2']:
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = KDEy(lr, target='min_divergence_uniform', divergence='HD', montecarlo_trials=5000, val_split=10)
+    elif method in ['KDEy-DMjs']:
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = KDEy(lr, target='min_divergence_uniform', divergence='JS', montecarlo_trials=5000, val_split=10)
+    elif method == 'DM-HD':
+        method_params = {
+            'nbins': [4,8,16,32],
+            'val_split': [10, 0.4],
+        }
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = DistributionMatching(lr, divergence='HD')

     else:
         raise NotImplementedError('unknown method', method)
@@ -13,48 +13,49 @@ if __name__ == '__main__':

     qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
     qp.environ['N_JOBS'] = -1
-    optim = 'mrae'
+    for optim in ['mae', 'mrae']:

         result_dir = f'results/lequa/{optim}'
         os.makedirs(result_dir, exist_ok=True)

         for method in METHODS:

             print('Init method', method)

             result_path = f'{result_dir}/{method}'

             if os.path.exists(result_path+'.csv'):
                 print(f'file {result_path}.csv already exist; skipping')
                 continue

             with open(result_path+'.csv', 'wt') as csv:
                 csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')

                 dataset = 'T1B'
                 train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
                 print(f'init {dataset} #instances: {len(train)}')
                 param_grid, quantifier = new_method(method)

                 if param_grid is not None:
                     modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)

                     modsel.fit(train)
                     print(f'best params {modsel.best_params_}')
                     print(f'best score {modsel.best_score_}')
                     pickle.dump(
                         (modsel.best_params_, modsel.best_score_,),
                         open(f'{result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

                     quantifier = modsel.best_model()
                 else:
                     print('debug mode... skipping model selection')
                     quantifier.fit(train)

                 report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
                 means = report.mean()
                 report.to_csv(result_path+'.dataframe')
                 csv.write(f'{method}\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                 csv.flush()

             show_results(result_path)
@@ -27,7 +27,7 @@ class KDEy(AggregativeProbabilisticQuantifier):

     BANDWIDTH_METHOD = ['auto', 'scott', 'silverman']
     ENGINE = ['scipy', 'sklearn', 'statsmodels']
-    TARGET = ['min_divergence', 'max_likelihood']
+    TARGET = ['min_divergence', 'min_divergence_uniform', 'max_likelihood']

     def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='L2',
                  bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None, random_state=0, montecarlo_trials=1000):
@@ -35,7 +35,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
             f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
         assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
         assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}'
-        assert divergence=='KLD', 'in this version I will only allow KLD as a divergence'
+        assert divergence in ['KLD', 'HD', 'JS'], 'in this version I will only allow KLD or squared HD as a divergence'
         self.classifier = classifier
         self.val_split = val_split
         self.divergence = divergence
@@ -118,7 +118,6 @@ class KDEy(AggregativeProbabilisticQuantifier):
         self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
             data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
         )
-        print('classifier fit done')

         if self.bandwidth == 'auto':
             self.bandwidth = self.search_bandwidth_maxlikelihood(posteriors, y)
@@ -126,21 +125,22 @@ class KDEy(AggregativeProbabilisticQuantifier):
         self.val_densities = [self.get_kde_function(posteriors[y == cat]) for cat in range(data.n_classes)]
         self.val_posteriors = posteriors

-        if self.target == 'min_divergence_depr':
+        if self.target == 'min_divergence_uniform':
             self.samples = qp.functional.uniform_prevalence_sampling(n_classes=data.n_classes, size=self.montecarlo_trials)
             self.sample_densities = [self.pdf(kde_i, self.samples) for kde_i in self.val_densities]
-        if self.target == 'min_divergence':
+        elif self.target == 'min_divergence':
             self.class_samples = [kde_i.sample(self.montecarlo_trials, random_state=self.random_state) for kde_i in self.val_densities]
             self.class_sample_densities = {}
             for ci, samples_i in enumerate(self.class_samples):
                 self.class_sample_densities[ci] = np.asarray([self.pdf(kde_j, samples_i) for kde_j in self.val_densities]).T

-        print('kde fit done')
         return self

     def aggregate(self, posteriors: np.ndarray):
         if self.target == 'min_divergence':
             return self._target_divergence(posteriors)
+        elif self.target == 'min_divergence_uniform':
+            return self._target_divergence_uniform(posteriors)
         elif self.target == 'max_likelihood':
             return self._target_likelihood(posteriors)
         else:
@@ -170,6 +170,42 @@ class KDEy(AggregativeProbabilisticQuantifier):
         # r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
         # return r.x

+    def _target_divergence_uniform(self, posteriors):
+        # in this variant we evaluate the divergence using a Montecarlo approach
+        n_classes = len(self.val_densities)
+
+        test_kde = self.get_kde_function(posteriors)
+        test_likelihood = self.pdf(test_kde, self.samples)
+
+        def f_squared_hellinger(t):
+            return (np.sqrt(t) - 1)**2
+
+        def f_jensen_shannon(t):
+            return -(t+1)*np.log((t+1)/2) + t*np.log(t)
+
+        def fdivergence(pi, qi, f, eps=1e-10):
+            spi = pi+eps
+            sqi = qi+eps
+            return np.mean(f(spi/sqi)*sqi)
+
+        if self.divergence.lower() == 'hd':
+            f = f_squared_hellinger
+        elif self.divergence.lower() == 'js':
+            f = f_jensen_shannon
+
+        def match(prev):
+            val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, self.sample_densities))
+            return fdivergence(val_likelihood, test_likelihood, f)
+
+        # the initial point is set as the uniform distribution
+        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+
+        # solutions are bounded to those contained in the unit-simplex
+        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
+        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
+        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+        return r.x
+
     def _target_divergence(self, posteriors):
         # in this variant we evaluate the divergence using a Montecarlo approach
         n_classes = len(self.val_densities)
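The `min_divergence_uniform` target added above estimates an f-divergence between the class mixture sum_i prev_i * p_i and the test KDE q by Monte Carlo: it averages f(p/q) * q over points drawn uniformly at random, and minimizes that estimate over the prevalence vector with SLSQP. What follows is a minimal self-contained sketch of the same scheme, with 1-D Gaussians standing in for the fitted KDEs; all names here (class_pdfs, test_pdf, fdivergence, match) are illustrative and do not belong to QuaPy:

import numpy as np
from scipy import optimize
from scipy.stats import norm

rng = np.random.default_rng(0)

# stand-ins for the class-conditional KDEs and for the test KDE (a 30/70 mixture)
class_pdfs = [norm(loc=0.2, scale=0.1).pdf, norm(loc=0.7, scale=0.1).pdf]
test_pdf = lambda x: 0.3 * class_pdfs[0](x) + 0.7 * class_pdfs[1](x)

# montecarlo trials: evaluate all densities once on uniformly drawn points, reuse per call
samples = rng.uniform(size=5000)
sample_densities = np.stack([pdf(samples) for pdf in class_pdfs])
test_likelihood = test_pdf(samples)

def f_squared_hellinger(t):
    return (np.sqrt(t) - 1) ** 2

def fdivergence(pi, qi, f, eps=1e-10):
    # Monte Carlo average of f(p/q) * q over uniform samples, proportional to D_f(p||q)
    return np.mean(f((pi + eps) / (qi + eps)) * (qi + eps))

def match(prev):
    mixture = prev @ sample_densities  # density of the prev-weighted mixture at the samples
    return fdivergence(mixture, test_likelihood, f_squared_hellinger)

n_classes = len(class_pdfs)
r = optimize.minimize(match, x0=np.full(n_classes, 1 / n_classes), method='SLSQP',
                      bounds=[(0, 1)] * n_classes,
                      constraints={'type': 'eq', 'fun': lambda x: 1 - sum(x)})
print(r.x)  # approaches [0.3, 0.7]

At the true prevalence the mixture equals the test density, the ratio p/q is 1 everywhere, and f(1)=0, so the minimizer recovers approximately [0.3, 0.7].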
@@ -184,6 +220,18 @@ class KDEy(AggregativeProbabilisticQuantifier):
             smooth_qi = qi+eps
             return np.mean(np.log(smooth_pi / smooth_qi))

+        def squared_hellinger(pi, qi, eps=1e-8):
+            smooth_pi = pi + eps
+            smooth_qi = qi + eps
+            return np.mean((np.sqrt(smooth_pi/smooth_qi)-1)**2)
+
+        # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
+        if self.divergence.lower() == 'kld':
+            fdivergence = kld_monte
+        elif self.divergence.lower() == 'hd':
+            fdivergence = squared_hellinger
+
+
         def match(prev):
             # choose the samples according to the prevalence vector
             # e.g., prev = [0.5, 0.3, 0.2] will draw 50% from KDE_0, 30% from KDE_1, and 20% from KDE_2
@@ -202,7 +250,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
             test_likelihood = np.concatenate(
                 [samples_i[:num_i] for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class)]
             )
-            return kld_monte(val_likelihood, test_likelihood)
+            return fdivergence(val_likelihood, test_likelihood)

         # the initial point is set as the uniform distribution
         uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
@@ -246,4 +294,5 @@ class KDEy(AggregativeProbabilisticQuantifier):
         #print('searching for alpha')
         r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
         #print('[optimization ended]')
         return r.x
+
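For contrast, the `max_likelihood` target whose `neg_loglikelihood` optimization closes this hunk needs no divergence estimate at all: it chooses the prevalence vector whose mixture maximizes the likelihood of the test points themselves. A sketch under the same toy setup as above (again, the names are illustrative stand-ins, not QuaPy API):

import numpy as np
from scipy import optimize
from scipy.stats import norm

rng = np.random.default_rng(1)

# toy class-conditional densities and a test sample with true prevalence [0.3, 0.7]
class_pdfs = [norm(loc=0.2, scale=0.1).pdf, norm(loc=0.7, scale=0.1).pdf]
test_points = np.concatenate([rng.normal(0.2, 0.1, 300), rng.normal(0.7, 0.1, 700)])
test_densities = np.stack([pdf(test_points) for pdf in class_pdfs])

def neg_loglikelihood(prev):
    # negative log-likelihood of the test points under the prev-weighted mixture
    return -np.sum(np.log(prev @ test_densities + 1e-10))

n_classes = len(class_pdfs)
r = optimize.minimize(neg_loglikelihood, x0=np.full(n_classes, 1 / n_classes), method='SLSQP',
                      bounds=[(0, 1)] * n_classes,
                      constraints={'type': 'eq', 'fun': lambda x: 1 - sum(x)})
print(r.x)  # approaches [0.3, 0.7]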
@@ -1,49 +1,21 @@
-Fundamental issue:
-KDE can be used to generate 2 distributions (one is a mixture model of the KDEs fit on training, conditioned on each class,
-and the other is a KDE fit on test), between which the divergence (the objective to minimize) is then computed. Another option is
-to generate a single distribution (the training mixture model) and take the likelihood of the test points as the objective
-to maximize.
-
-- take a look at the hyperparameters
-- make some plots
-- study the case in which the target is minimizing a divergence. Possibilities:
-    - evaluate on the test points only
-    - evaluate an APP over the simplex?
-    - evaluate a UPP over the simplex? (=Montecarlo)
-    - which divergences? HD, topsoe, L1?
-- I don't think I am evaluating in kfcv mode either...
-
-1) produce lequa-kfcv and all the kfcv variants that may make sense for tweets
+1.- I don't know whether it would be easier to take r=uniform rather than r=mixture model; it simplifies the sampling a lot and might even produce less error
+2.- For now I have KLD and HD:
+    - for KLD I have not understood whether I have to add the -x + y term
+3.- Can the topsoe be cast as an f-divergence?
+    The topsoe seems to be 2 times the Jensen-Shannon divergence, that is,
+    topsoe(p,q) = kld(p|m) + kld(q|m), with m = (p+q)/2
+4.- Can the Wasserstein be cast as an f-divergence?
+5.- In general, what is the relation with "distances"?
+
 2) implement the auto
     - internal optimization for likelihood [none seems to work well]
     - over everything (e.g., the whole training set)?
     - independently for each labelled set? (e.g., positives, negatives, neutrals, and test)
     - optimization as a GridSearchQ parameter
 6) optimize the kernel? optimize the distance?
-7) sklearn KDE or statsmodels multivariate KDE? also check what this is (it seems to give P(Y|X), so maybe it could
-   eliminate the classifier?):
-   https://www.statsmodels.org/dev/_modules/statsmodels/nonparametric/kernel_density.html#KDEMultivariateConditional
-8) remove the last dimension in sklearn too? I don't see why
-9) optimize for RAE instead of AE? It doesn't go well...
 10) Define a classifier that returns, for each class, a posterior computed as the likelihood under the class-conditional KDE divided
     by the likelihood under all the classes (as Juanjo proposes) and plug it into EMD. Do it the other way round: re-calibrate with
     EMD and plug it into KDEy
 11) KDEx?
-12) Dirichlet (the DIR method) should be fixed and its results shown...
-13) Statistical tests.
-
-Notes:
-I am trying to replace the max_likelihood target with a min_divergence one:
-- since the divergence between two KDEs is now in the continuous space, it is not easy to obtain. I am trying
-  an evaluation on test, but the problem is that it is overconfident with respect to the one obtained on test.
-  Another option is a MonteCarlo approach, which is what I am trying now. For this experiment I have removed the model selection
-  of the classifier, keeping only the one over the bandwidth, to speed things up. The KDE-nomonte results are a
-  max_likelihood under equal conditions (bandwidth only), KDE-monte1 is a Montecarlo with HD at 1000 points, and KDE-monte2
-  is the same but with 5000 points; both work badly. I am going to delete KDE-monte1 and KDE-monte2.
-  Now I am trying KDE-monte3, the same but with an L2 as the
-  divergence. It looks much more similar to KDE-nomonte (but is still somewhat worse)
-- try with more points (KDE-monte4 is at 5000 points)
-- topsoe should be tried (KDE-monte5)
-- try optimizing the LR (KDE-monte6, and with kfcv)
-- try L1 instead of L2 (KDE-monte7, with 5000 points and without LR)
-- maybe L2, which works well, should be tried in the min_divergence that evaluated on test, or on test+train
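On point 3 of the new notes: yes, the topsoe distance is an f-divergence. The generator used in `f_jensen_shannon` above, f(t) = t*log(t) - (t+1)*log((t+1)/2), expands exactly to kld(p|m) + kld(q|m) with m = (p+q)/2, that is, twice the Jensen-Shannon divergence. (On point 2, the "-x + y" presumably refers to the generalized KL for unnormalized densities, the integral of p*log(p/q) - p + q, which remains nonnegative when p and q do not integrate to 1 under the sampling measure; this reading is an assumption, not something stated in the repo.) A quick numeric check of the topsoe identity on random discrete distributions:

import numpy as np

rng = np.random.default_rng(2)
p = rng.dirichlet(np.ones(5))
q = rng.dirichlet(np.ones(5))
m = (p + q) / 2

kld = lambda a, b: np.sum(a * np.log(a / b))
topsoe = kld(p, m) + kld(q, m)  # = 2 * Jensen-Shannon(p, q)

f = lambda t: t * np.log(t) - (t + 1) * np.log((t + 1) / 2)
f_div = np.sum(q * f(p / q))    # discrete f-divergence with the commit's generator

print(np.isclose(topsoe, f_div))  # True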
@@ -15,70 +15,71 @@ if __name__ == '__main__':
     qp.environ['N_JOBS'] = -1
     n_bags_val = 250
     n_bags_test = 1000
-    optim = 'mae'
+    for optim in ['mae', 'mrae']:

         result_dir = f'results/tweet/{optim}'
         os.makedirs(result_dir, exist_ok=True)

         for method in METHODS:

             print('Init method', method)

             global_result_path = f'{result_dir}/{method}'

             if not os.path.exists(global_result_path+'.csv'):
                 with open(global_result_path+'.csv', 'wt') as csv:
                     csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')

             with open(global_result_path+'.csv', 'at') as csv:
                 # four semeval dataset share the training, so it is useless to optimize hyperparameters four times;
                 # this variable controls that the mod sel has already been done, and skip this otherwise
                 semeval_trained = False

                 for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
                     print('init', dataset)

                     local_result_path = global_result_path + '_' + dataset
                     if os.path.exists(local_result_path+'.dataframe'):
                         print(f'result file {local_result_path}.dataframe already exist; skipping')
                         continue

                     with qp.util.temp_seed(SEED):

                         is_semeval = dataset.startswith('semeval')

                         if not is_semeval or not semeval_trained:

                             param_grid, quantifier = new_method(method)

                             # model selection
                             data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)

                             protocol = UPP(data.test, repeats=n_bags_val)
                             modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)

                             modsel.fit(data.training)
                             print(f'best params {modsel.best_params_}')
                             print(f'best score {modsel.best_score_}')
                             pickle.dump(
                                 (modsel.best_params_, modsel.best_score_,),
                                 open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

                             quantifier = modsel.best_model()

                             if is_semeval:
                                 semeval_trained = True

                         else:
                             print(f'model selection for {dataset} already done; skipping')

                         data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
                         quantifier.fit(data.training)
                         protocol = UPP(data.test, repeats=n_bags_test)
                         report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
                         report.to_csv(f'{local_result_path}.dataframe')
                         means = report.mean()
                         csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                         csv.flush()

             show_results(global_result_path)