
Understanding Monte Carlo sampling

Alejandro Moreo Fernandez 2023-10-02 17:50:12 +02:00
parent 56dbe744df
commit 32d6aa58f6
6 changed files with 222 additions and 179 deletions

View File

@@ -16,68 +16,68 @@ if __name__ == '__main__':
    qp.environ['N_JOBS'] = -1
    n_bags_val = 250
    n_bags_test = 1000
-    optim = 'mae'
+    for optim in ['mae', 'mrae']:
        result_dir = f'results/binary/{optim}'
        os.makedirs(result_dir, exist_ok=True)

        for method in BIN_METHODS:
            print('Init method', method)
            global_result_path = f'{result_dir}/{method}'

            if not os.path.exists(global_result_path + '.csv'):
                with open(global_result_path + '.csv', 'wt') as csv:
                    csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')

            with open(global_result_path + '.csv', 'at') as csv:
                for dataset in qp.datasets.UCI_DATASETS:
                    if dataset in ['acute.a', 'acute.b', 'iris.1']: continue  # , 'pageblocks.5', 'spambase', 'wdbc']: continue
                    print('init', dataset)

                    local_result_path = global_result_path + '_' + dataset
                    if os.path.exists(local_result_path + '.dataframe'):
                        print(f'result file {local_result_path}.dataframe already exist; skipping')
                        continue

                    with qp.util.temp_seed(SEED):
                        param_grid, quantifier = new_method(method, max_iter=3000)
                        data = qp.datasets.fetch_UCIDataset(dataset)

                        # model selection
                        train, test = data.train_test
                        train, val = train.split_stratified()
                        protocol = UPP(val, repeats=n_bags_val)
                        modsel = GridSearchQ(
                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error=optim
                        )

                        try:
                            modsel.fit(train)
                            print(f'best params {modsel.best_params_}')
                            print(f'best score {modsel.best_score_}')
                            pickle.dump(
                                (modsel.best_params_, modsel.best_score_,),
                                open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
                            quantifier = modsel.best_model()
                        except:
                            print('something went wrong... reporting CC')
                            quantifier = qp.method.aggregative.CC(LR()).fit(train)

                        protocol = UPP(test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
                                                                 verbose=True)
                        report.to_csv(f'{local_result_path}.dataframe')

                        means = report.mean()
                        csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                        csv.flush()

            show_results(global_result_path)

View File

@@ -6,8 +6,8 @@ from distribution_matching.method_dirichlety import DIRy
from sklearn.linear_model import LogisticRegression

-METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML']
-BIN_METHODS = ['ACC', 'PACC', 'HDy', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML']
+METHODS = ['KDEy-DMjs', 'ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML']  #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD']
+BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]

hyper_LR = {
@@ -57,10 +57,30 @@ def new_method(method, **lr_kwargs):
        param_grid = {**method_params, **hyper_LR}
        quantifier = DistributionMatching(lr)
-    elif method in ['KDE-DMkld']:  # experimental
+    elif method in ['KDEy-DMkld']:
        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='min_divergence', divergence='KLD', montecarlo_trials=5000, val_split=10)
+    elif method in ['KDEy-DMhd']:
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
+    elif method in ['KDEy-DMhd2']:
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = KDEy(lr, target='min_divergence_uniform', divergence='HD', montecarlo_trials=5000, val_split=10)
+    elif method in ['KDEy-DMjs']:
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = KDEy(lr, target='min_divergence_uniform', divergence='JS', montecarlo_trials=5000, val_split=10)
+    elif method == 'DM-HD':
+        method_params = {
+            'nbins': [4,8,16,32],
+            'val_split': [10, 0.4],
+        }
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = DistributionMatching(lr, divergence='HD')
    else:
        raise NotImplementedError('unknown method', method)
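
For orientation, the branches added above differ only in the target/divergence pair passed to KDEy: 'KDEy-DMkld' and 'KDEy-DMhd' minimize a divergence estimated on samples drawn from the class-conditional validation KDEs (target='min_divergence'), whereas 'KDEy-DMhd2' and 'KDEy-DMjs' estimate it on points drawn uniformly from the probability simplex (target='min_divergence_uniform'); 'DM-HD' is the histogram-based DistributionMatching baseline with the Hellinger distance. A minimal sketch of how one of the new entries could be instantiated outside new_method (illustrative only, not part of the commit; the KDEy import path and the hyper_LR grid below are assumptions standing in for the ones defined in this file):

import numpy as np
from sklearn.linear_model import LogisticRegression
from distribution_matching.method_kdey import KDEy  # assumed module name, mirroring method_dirichlety above

# stand-in for the LR hyperparameter grid defined earlier in this file
hyper_LR = {'classifier__C': np.logspace(-3, 3, 7), 'classifier__class_weight': ['balanced', None]}

lr = LogisticRegression(max_iter=3000)
method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
param_grid = {**method_params, **hyper_LR}

# mirrors the 'KDEy-DMjs' branch: KDE-based quantifier minimizing a Monte Carlo
# estimate of the Jensen-Shannon divergence computed on uniform simplex samples
quantifier = KDEy(lr, target='min_divergence_uniform', divergence='JS',
                  montecarlo_trials=5000, val_split=10)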

View File

@@ -13,48 +13,49 @@ if __name__ == '__main__':
    qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
    qp.environ['N_JOBS'] = -1
-    optim = 'mrae'
+    for optim in ['mae', 'mrae']:

        result_dir = f'results/lequa/{optim}'
        os.makedirs(result_dir, exist_ok=True)

        for method in METHODS:
            print('Init method', method)
            result_path = f'{result_dir}/{method}'

            if os.path.exists(result_path+'.csv'):
                print(f'file {result_path}.csv already exist; skipping')
                continue

            with open(result_path+'.csv', 'wt') as csv:
                csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')

                dataset = 'T1B'
                train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
                print(f'init {dataset} #instances: {len(train)}')

                param_grid, quantifier = new_method(method)

                if param_grid is not None:
                    modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)

                    modsel.fit(train)
                    print(f'best params {modsel.best_params_}')
                    print(f'best score {modsel.best_score_}')
                    pickle.dump(
                        (modsel.best_params_, modsel.best_score_,),
                        open(f'{result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

                    quantifier = modsel.best_model()
                else:
                    print('debug mode... skipping model selection')

                quantifier.fit(train)
                report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
                means = report.mean()
                report.to_csv(result_path+'.dataframe')
                csv.write(f'{method}\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                csv.flush()

            show_results(result_path)

View File

@@ -27,7 +27,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
    BANDWIDTH_METHOD = ['auto', 'scott', 'silverman']
    ENGINE = ['scipy', 'sklearn', 'statsmodels']
-    TARGET = ['min_divergence', 'max_likelihood']
+    TARGET = ['min_divergence', 'min_divergence_uniform', 'max_likelihood']

    def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='L2',
                 bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None, random_state=0, montecarlo_trials=1000):
@@ -35,7 +35,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
            f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
        assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
        assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}'
-        assert divergence=='KLD', 'in this version I will only allow KLD as a divergence'
+        assert divergence in ['KLD', 'HD', 'JS'], 'in this version I will only allow KLD or squared HD as a divergence'
        self.classifier = classifier
        self.val_split = val_split
        self.divergence = divergence
@@ -118,7 +118,6 @@ class KDEy(AggregativeProbabilisticQuantifier):
        self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )
-        print('classifier fit done')

        if self.bandwidth == 'auto':
            self.bandwidth = self.search_bandwidth_maxlikelihood(posteriors, y)
@@ -126,21 +125,22 @@ class KDEy(AggregativeProbabilisticQuantifier):
        self.val_densities = [self.get_kde_function(posteriors[y == cat]) for cat in range(data.n_classes)]
        self.val_posteriors = posteriors

-        if self.target == 'min_divergence_depr':
+        if self.target == 'min_divergence_uniform':
            self.samples = qp.functional.uniform_prevalence_sampling(n_classes=data.n_classes, size=self.montecarlo_trials)
            self.sample_densities = [self.pdf(kde_i, self.samples) for kde_i in self.val_densities]
-        if self.target == 'min_divergence':
+        elif self.target == 'min_divergence':
            self.class_samples = [kde_i.sample(self.montecarlo_trials, random_state=self.random_state) for kde_i in self.val_densities]
            self.class_sample_densities = {}
            for ci, samples_i in enumerate(self.class_samples):
                self.class_sample_densities[ci] = np.asarray([self.pdf(kde_j, samples_i) for kde_j in self.val_densities]).T
-        print('kde fit done')

        return self

    def aggregate(self, posteriors: np.ndarray):
        if self.target == 'min_divergence':
            return self._target_divergence(posteriors)
+        elif self.target == 'min_divergence_uniform':
+            return self._target_divergence_uniform(posteriors)
        elif self.target == 'max_likelihood':
            return self._target_likelihood(posteriors)
        else:
@@ -170,6 +170,42 @@ class KDEy(AggregativeProbabilisticQuantifier):
        # r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        # return r.x

+    def _target_divergence_uniform(self, posteriors):
+        # in this variant we evaluate the divergence using a Montecarlo approach
+        n_classes = len(self.val_densities)
+
+        test_kde = self.get_kde_function(posteriors)
+        test_likelihood = self.pdf(test_kde, self.samples)
+
+        def f_squared_hellinger(t):
+            return (np.sqrt(t) - 1)**2
+
+        def f_jensen_shannon(t):
+            return -(t+1)*np.log((t+1)/2) + t*np.log(t)
+
+        def fdivergence(pi, qi, f, eps=1e-10):
+            spi = pi+eps
+            sqi = qi+eps
+            return np.mean(f(spi/sqi)*sqi)
+
+        if self.divergence.lower() == 'hd':
+            f = f_squared_hellinger
+        elif self.divergence.lower() == 'js':
+            f = f_jensen_shannon
+
+        def match(prev):
+            val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, self.sample_densities))
+            return fdivergence(val_likelihood, test_likelihood, f)
+
+        # the initial point is set as the uniform distribution
+        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+
+        # solutions are bounded to those contained in the unit-simplex
+        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
+        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
+        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+        return r.x
+
    def _target_divergence(self, posteriors):
        # in this variant we evaluate the divergence using a Montecarlo approach
        n_classes = len(self.val_densities)
@@ -184,6 +220,18 @@ class KDEy(AggregativeProbabilisticQuantifier):
            smooth_qi = qi+eps
            return np.mean(np.log(smooth_pi / smooth_qi))

+        def squared_hellinger(pi, qi, eps=1e-8):
+            smooth_pi = pi + eps
+            smooth_qi = qi + eps
+            return np.mean((np.sqrt(smooth_pi/smooth_qi)-1)**2)
+
+        # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
+        if self.divergence.lower() == 'kld':
+            fdivergence = kld_monte
+        elif self.divergence.lower() == 'hd':
+            fdivergence = squared_hellinger
+
        def match(prev):
            # choose the samples according to the prevalence vector
            # e.g., prev = [0.5, 0.3, 0.2] will draw 50% from KDE_0, 30% from KDE_1, and 20% from KDE_2
@@ -202,7 +250,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
            test_likelihood = np.concatenate(
                [samples_i[:num_i] for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class)]
            )
-            return kld_monte(val_likelihood, test_likelihood)
+            return fdivergence(val_likelihood, test_likelihood)

        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
@@ -246,4 +294,5 @@ class KDEy(AggregativeProbabilisticQuantifier):
        #print('searching for alpha')
        r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        #print('[optimization ended]')
        return r.x
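
The Monte Carlo logic behind the two minimization targets above is the same: for p (the validation mixture induced by a candidate prevalence vector) and q (the KDE fitted on the test posteriors), an f-divergence D_f(p||q) = integral of q(x) f(p(x)/q(x)) dx can be estimated by importance sampling as the average of f(p(x_i)/q(x_i)) * q(x_i)/r(x_i) over points x_i drawn from any sampling density r. With r uniform over the simplex, q/r is q times a constant volume factor, which is what fdivergence computes in _target_divergence_uniform (the constant is dropped since it does not change the argmin); for KLD with samples drawn from p one recovers the familiar mean(log(p/q)) estimate used by kld_monte. A toy, self-contained sketch of the uniform-sampling estimator against a direct estimate, on 1-D Gaussian mixtures (illustrative only, not part of the commit; p and q are stand-ins for the validation mixture and the test KDE):

import numpy as np

rng = np.random.default_rng(0)
SQRT2PI = np.sqrt(2 * np.pi)

def p(x):  # toy stand-in for the validation mixture at a candidate prevalence [0.3, 0.7]
    return (0.3 * np.exp(-0.5 * (x - 1) ** 2) + 0.7 * np.exp(-0.5 * (x + 1) ** 2)) / SQRT2PI

def q(x):  # toy stand-in for the KDE fitted on the test posteriors
    return (0.5 * np.exp(-0.5 * (x - 1) ** 2) + 0.5 * np.exp(-0.5 * (x + 1) ** 2)) / SQRT2PI

def f_squared_hellinger(t):
    return (np.sqrt(t) - 1) ** 2

# (a) uniform sampling over the domain [-6, 6] (volume 12), as in 'min_divergence_uniform':
#     D_f(p||q) ~= volume * mean( f(p/q) * q )
x = rng.uniform(-6, 6, size=100_000)
est_uniform = 12 * np.mean(f_squared_hellinger(p(x) / q(x)) * q(x))

# (b) sampling from q itself, so the importance weights q/r cancel:
#     D_f(p||q) ~= mean( f(p/q) )
x = np.concatenate([rng.normal(+1, 1, 50_000), rng.normal(-1, 1, 50_000)])
est_from_q = np.mean(f_squared_hellinger(p(x) / q(x)))

print(est_uniform, est_from_q)  # both approximate the integral of (sqrt(p) - sqrt(q))^2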

View File

@@ -1,49 +1,21 @@
-Fundamental point:
-KDE can be used to generate 2 distributions (one is a mixture model of KDEs fitted on training data, conditioned on each class,
-and the other is a KDE fitted on test), from which the divergence (the objective to minimize) is then computed. Another option is
-to generate only one distribution (the training mixture model) and take the likelihood of the test points as the objective
-to maximize.
+1.- I don't know whether it would be easier to take r=uniform rather than r=mixture model; it simplifies the sampling a lot and might even produce less error
+2.- For now I have KLD and HD:
+    - for KLD I have not understood whether I have to add the -x + y
+3.- Can the Topsoe be expressed as an f-divergence?
+    The Topsoe seems to be 2 times the Jensen-Shannon divergence, that is,
+    topsoe(p,q) = kld(p|m) + kld(q|m), with m = (p+q)/2
+4.- Can the Wasserstein distance be expressed as an f-divergence?
+5.- In general, what is the relationship with "distances"?

-- take a look at the hyperparameters
-- make some plots
-- study the case in which the target is to minimize a divergence. Possibilities:
-    - evaluate the test points only
-    - evaluate an APP over the simplex?
-    - evaluate a UPP over the simplex? (= Monte Carlo)
-    - which divergences? HD, Topsoe, L1?
-- I don't think I am evaluating in kfcv mode either...
-
-1) produce lequa-kfcv and all the kfcv variants that may make sense for the tweet datasets
2) implement the auto
    - internal optimization for likelihood [none seems to work well]
    - over everything (e.g., the whole training set)?
    - independently for each labelled set? (e.g., positives, negatives, neutrals, and test)
    - optimization as a GridSearchQ parameter
6) optimize the kernel? optimize the distance?
-7) sklearn KDE or statsmodels multivariate KDE? also check what this is (it seems to give P(Y|X), so maybe the
-   classifier could be removed?):
-   https://www.statsmodels.org/dev/_modules/statsmodels/nonparametric/kernel_density.html#KDEMultivariateConditional
-8) remove the last dimension in sklearn as well? I don't see why
-9) optimize for RAE instead of AE? It does not go well...
10) Define a classifier that returns, for each class, a posterior computed as the likelihood under the class-conditional KDE divided
    by the likelihood under all the classes (as Juanjo proposes) and plug it into EMD. Do the opposite: re-calibrate with
    EMD and plug it into KDEy
11) KDEx?
-12) Dirichlet (the DIR method) should be fixed and its results shown...
-13) Statistical tests.
-
-Notes:
-I am trying to replace the max_likelihood target with a min_divergence one:
-- since the divergence between two KDEs now lives in the continuous space, it is not easy to obtain. I am trying
-  an evaluation on the test points, but the problem is that it is overconfident with respect to the one obtained on test.
-  Another option is a Monte Carlo approach, which is what I am trying now. For this experiment I have removed the model selection
-  of the classifier and I am keeping only the one over the bandwidth, to speed things up. The KDE-nomonte results are a
-  max_likelihood under equal conditions (bandwidth only), KDE-monte1 is a Monte Carlo with HD at 1000 points, and KDE-monte2
-  is the same but with 5000 points; both work badly. I am going to delete KDE-monte1 and KDE-monte2.
-  I am now trying KDE-monte3, the same but with an L2 as the divergence. It looks much more similar to KDE-nomonte
-  (but it is still somewhat worse)
-    - try with more points (KDE-monte4 is at 5000 points)
-    - should try with Topsoe (KDE-monte5)
-    - try with LR optimization (KDE-monte6, and with kfcv)
-    - try with L1 instead of L2 (KDE-monte7 with 5000 points and without LR)
-    - maybe the L2, which works well, should be tried in the min_divergence variant that evaluated on test, or test+train
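
On points 2-4 above (a sketch of the math, not part of the commit): the Topsoe distance is indeed an f-divergence and equals twice the Jensen-Shannon divergence. Writing D_f(p||q) as the integral of q(x) f(p(x)/q(x)) dx, with t = p/q:

    topsoe(p,q) = kld(p|m) + kld(q|m) = 2 * JS(p,q),   with m = (p+q)/2
    f_topsoe(t) = t*log(t) - (t+1)*log((t+1)/2)

which is exactly the generator coded as f_jensen_shannon in the kdey.py hunk above (that function therefore computes 2*JS, i.e. the Topsoe). The generator (sqrt(t)-1)^2 used for 'HD' yields the integral of (sqrt(p)-sqrt(q))^2, i.e. the squared Hellinger distance up to the 1/2 factor used in some conventions. Regarding the "-x + y" of point 2: the KL generator f(t) = t*log(t) assumes normalized densities; for unnormalized ones it becomes t*log(t) - t + 1, which adds the term integral of (q - p) and vanishes when both densities integrate to 1 (presumably the correction the note refers to). As for point 4, the Wasserstein distance is not an f-divergence: it depends on the ground metric of the underlying space, whereas f-divergences depend only on the likelihood ratio.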

View File

@@ -15,70 +15,71 @@ if __name__ == '__main__':
    qp.environ['N_JOBS'] = -1
    n_bags_val = 250
    n_bags_test = 1000
-    optim = 'mae'
+    for optim in ['mae', 'mrae']:

        result_dir = f'results/tweet/{optim}'
        os.makedirs(result_dir, exist_ok=True)

        for method in METHODS:
            print('Init method', method)
            global_result_path = f'{result_dir}/{method}'

            if not os.path.exists(global_result_path+'.csv'):
                with open(global_result_path+'.csv', 'wt') as csv:
                    csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')

            with open(global_result_path+'.csv', 'at') as csv:
                # four semeval dataset share the training, so it is useless to optimize hyperparameters four times;
                # this variable controls that the mod sel has already been done, and skip this otherwise
                semeval_trained = False

                for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
                    print('init', dataset)

                    local_result_path = global_result_path + '_' + dataset
                    if os.path.exists(local_result_path+'.dataframe'):
                        print(f'result file {local_result_path}.dataframe already exist; skipping')
                        continue

                    with qp.util.temp_seed(SEED):
                        is_semeval = dataset.startswith('semeval')

                        if not is_semeval or not semeval_trained:
                            param_grid, quantifier = new_method(method)

                            # model selection
                            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)

                            protocol = UPP(data.test, repeats=n_bags_val)
                            modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)

                            modsel.fit(data.training)
                            print(f'best params {modsel.best_params_}')
                            print(f'best score {modsel.best_score_}')
                            pickle.dump(
                                (modsel.best_params_, modsel.best_score_,),
                                open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

                            quantifier = modsel.best_model()

                            if is_semeval:
                                semeval_trained = True
                        else:
                            print(f'model selection for {dataset} already done; skipping')

                        data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
                        quantifier.fit(data.training)
                        protocol = UPP(data.test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
                        report.to_csv(f'{local_result_path}.dataframe')
                        means = report.mean()
                        csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                        csv.flush()

            show_results(global_result_path)