diff --git a/distribution_matching/binary_experiments.py b/distribution_matching/binary_experiments.py
index 2a17027..a8922ad 100644
--- a/distribution_matching/binary_experiments.py
+++ b/distribution_matching/binary_experiments.py
@@ -16,68 +16,68 @@ if __name__ == '__main__':
     qp.environ['N_JOBS'] = -1
     n_bags_val = 250
     n_bags_test = 1000
-    optim = 'mae'
-    result_dir = f'results/binary/{optim}'
+    for optim in ['mae', 'mrae']:
+        result_dir = f'results/binary/{optim}'
 
-    os.makedirs(result_dir, exist_ok=True)
+        os.makedirs(result_dir, exist_ok=True)
 
-    for method in BIN_METHODS:
+        for method in BIN_METHODS:
 
-        print('Init method', method)
+            print('Init method', method)
 
-        global_result_path = f'{result_dir}/{method}'
+            global_result_path = f'{result_dir}/{method}'
 
-        if not os.path.exists(global_result_path + '.csv'):
-            with open(global_result_path + '.csv', 'wt') as csv:
-                csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
+            if not os.path.exists(global_result_path + '.csv'):
+                with open(global_result_path + '.csv', 'wt') as csv:
+                    csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
 
-        with open(global_result_path + '.csv', 'at') as csv:
+            with open(global_result_path + '.csv', 'at') as csv:
 
-            for dataset in qp.datasets.UCI_DATASETS:
-                if dataset in ['acute.a', 'acute.b', 'iris.1']: continue # , 'pageblocks.5', 'spambase', 'wdbc']: continue
+                for dataset in qp.datasets.UCI_DATASETS:
+                    if dataset in ['acute.a', 'acute.b', 'iris.1']: continue # , 'pageblocks.5', 'spambase', 'wdbc']: continue
 
-                print('init', dataset)
+                    print('init', dataset)
 
-                local_result_path = global_result_path + '_' + dataset
-                if os.path.exists(local_result_path + '.dataframe'):
-                    print(f'result file {local_result_path}.dataframe already exist; skipping')
-                    continue
+                    local_result_path = global_result_path + '_' + dataset
+                    if os.path.exists(local_result_path + '.dataframe'):
+                        print(f'result file {local_result_path}.dataframe already exists; skipping')
+                        continue
 
-                with qp.util.temp_seed(SEED):
+                    with qp.util.temp_seed(SEED):
 
-                    param_grid, quantifier = new_method(method, max_iter=3000)
+                        param_grid, quantifier = new_method(method, max_iter=3000)
 
-                    data = qp.datasets.fetch_UCIDataset(dataset)
+                        data = qp.datasets.fetch_UCIDataset(dataset)
 
-                    # model selection
-                    train, test = data.train_test
-                    train, val = train.split_stratified()
+                        # model selection
+                        train, test = data.train_test
+                        train, val = train.split_stratified()
 
-                    protocol = UPP(val, repeats=n_bags_val)
-                    modsel = GridSearchQ(
-                        quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error=optim
-                    )
+                        protocol = UPP(val, repeats=n_bags_val)
+                        modsel = GridSearchQ(
+                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error=optim
+                        )
 
-                    try:
-                        modsel.fit(train)
+                        try:
+                            modsel.fit(train)
 
-                        print(f'best params {modsel.best_params_}')
-                        print(f'best score {modsel.best_score_}')
-                        pickle.dump(
-                            (modsel.best_params_, modsel.best_score_,),
-                            open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
+                            print(f'best params {modsel.best_params_}')
+                            print(f'best score {modsel.best_score_}')
+                            pickle.dump(
+                                (modsel.best_params_, modsel.best_score_,),
+                                open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
 
-                        quantifier = modsel.best_model()
-                    except:
-                        print('something went wrong... reporting CC')
-                        quantifier = qp.method.aggregative.CC(LR()).fit(train)
+                            quantifier = modsel.best_model()
+                        except:
+                            print('something went wrong... reporting CC')
+                            quantifier = qp.method.aggregative.CC(LR()).fit(train)
 
-                    protocol = UPP(test, repeats=n_bags_test)
-                    report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
-                                                             verbose=True)
-                    report.to_csv(f'{local_result_path}.dataframe')
-                    means = report.mean()
-                    csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
-                    csv.flush()
+                        protocol = UPP(test, repeats=n_bags_test)
+                        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
+                                                                 verbose=True)
+                        report.to_csv(f'{local_result_path}.dataframe')
+                        means = report.mean()
+                        csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
+                        csv.flush()
 
-    show_results(global_result_path)
+        show_results(global_result_path)
diff --git a/distribution_matching/commons.py b/distribution_matching/commons.py
index d2ccd40..2b485ec 100644
--- a/distribution_matching/commons.py
+++ b/distribution_matching/commons.py
@@ -6,8 +6,8 @@ from distribution_matching.method_dirichlety import DIRy
 from sklearn.linear_model import LogisticRegression
 
 
-METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML']
-BIN_METHODS = ['ACC', 'PACC', 'HDy', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML']
+METHODS = ['KDEy-DMjs', 'ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD']
+BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
 
 
 hyper_LR = {
@@ -57,10 +57,30 @@ def new_method(method, **lr_kwargs):
         param_grid = {**method_params, **hyper_LR}
         quantifier = DistributionMatching(lr)
 
-    elif method in ['KDE-DMkld']:
+    # experimental
+    elif method in ['KDEy-DMkld']:
         method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
         param_grid = {**method_params, **hyper_LR}
         quantifier = KDEy(lr, target='min_divergence', divergence='KLD', montecarlo_trials=5000, val_split=10)
+    elif method in ['KDEy-DMhd']:
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
+    elif method in ['KDEy-DMhd2']:
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = KDEy(lr, target='min_divergence_uniform', divergence='HD', montecarlo_trials=5000, val_split=10)
+    elif method in ['KDEy-DMjs']:
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = KDEy(lr, target='min_divergence_uniform', divergence='JS', montecarlo_trials=5000, val_split=10)
+    elif method == 'DM-HD':
+        method_params = {
+            'nbins': [4,8,16,32],
+            'val_split': [10, 0.4],
+        }
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = DistributionMatching(lr, divergence='HD')
 
     else:
        raise NotImplementedError('unknown method', method)
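
A quick sanity check of the new BIN_METHODS derivation in commons.py: only 'HDy-OvA' carries the '-OvA' suffix, so it is the only entry the comprehension renames (illustrative snippet, not part of the patch):

    METHODS = ['KDEy-DMjs', 'ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML']
    BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
    print(BIN_METHODS)
    # ['KDEy-DMjs', 'ACC', 'PACC', 'HDy', 'DIR', 'DM', 'KDEy-DM', 'EMQ', 'KDEy-ML']
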
diff --git a/distribution_matching/lequa_experiments.py b/distribution_matching/lequa_experiments.py
index 24c1125..27dc921 100644
--- a/distribution_matching/lequa_experiments.py
+++ b/distribution_matching/lequa_experiments.py
@@ -13,48 +13,49 @@ if __name__ == '__main__':
     qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
     qp.environ['N_JOBS'] = -1
-    optim = 'mrae'
-    result_dir = f'results/lequa/{optim}'
+    for optim in ['mae', 'mrae']:
 
-    os.makedirs(result_dir, exist_ok=True)
+        result_dir = f'results/lequa/{optim}'
 
-    for method in METHODS:
-
-        print('Init method', method)
+        os.makedirs(result_dir, exist_ok=True)
 
-        result_path = f'{result_dir}/{method}'
-
-        if os.path.exists(result_path+'.csv'):
-            print(f'file {result_path}.csv already exist; skipping')
-            continue
+        for method in METHODS:
 
-        with open(result_path+'.csv', 'wt') as csv:
-            csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
+            print('Init method', method)
 
-        dataset = 'T1B'
-        train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
-        print(f'init {dataset} #instances: {len(train)}')
-        param_grid, quantifier = new_method(method)
+            result_path = f'{result_dir}/{method}'
 
-        if param_grid is not None:
-            modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)
+            if os.path.exists(result_path+'.csv'):
+                print(f'file {result_path}.csv already exists; skipping')
+                continue
 
-            modsel.fit(train)
-            print(f'best params {modsel.best_params_}')
-            print(f'best score {modsel.best_score_}')
-            pickle.dump(
-                (modsel.best_params_, modsel.best_score_,),
-                open(f'{result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
+            with open(result_path+'.csv', 'wt') as csv:
+                csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
 
-            quantifier = modsel.best_model()
-        else:
-            print('debug mode... skipping model selection')
-            quantifier.fit(train)
+            dataset = 'T1B'
+            train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
+            print(f'init {dataset} #instances: {len(train)}')
+            param_grid, quantifier = new_method(method)
 
-        report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
-        means = report.mean()
-        report.to_csv(result_path+'.dataframe')
-        csv.write(f'{method}\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
-        csv.flush()
+            if param_grid is not None:
+                modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)
 
-    show_results(result_path)
+                modsel.fit(train)
+                print(f'best params {modsel.best_params_}')
+                print(f'best score {modsel.best_score_}')
+                pickle.dump(
+                    (modsel.best_params_, modsel.best_score_,),
+                    open(f'{result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
+
+                quantifier = modsel.best_model()
+            else:
+                print('debug mode... skipping model selection')
+                quantifier.fit(train)
+
+            report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
+            means = report.mean()
+            report.to_csv(result_path+'.dataframe')
+            csv.write(f'{method}\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
+            csv.flush()
+
+        show_results(result_path)
diff --git a/distribution_matching/method_kdey.py b/distribution_matching/method_kdey.py
index 9ab10c9..b3a6ceb 100644
--- a/distribution_matching/method_kdey.py
+++ b/distribution_matching/method_kdey.py
@@ -27,7 +27,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
 
     BANDWIDTH_METHOD = ['auto', 'scott', 'silverman']
     ENGINE = ['scipy', 'sklearn', 'statsmodels']
-    TARGET = ['min_divergence', 'max_likelihood']
+    TARGET = ['min_divergence', 'min_divergence_uniform', 'max_likelihood']
 
     def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='L2',
                  bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None, random_state=0, montecarlo_trials=1000):
@@ -35,7 +35,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
             f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
         assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
         assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}'
-        assert divergence=='KLD', 'in this version I will only allow KLD as a divergence'
+        assert divergence in ['KLD', 'HD', 'JS'], 'in this version, only KLD, squared HD, and JS are allowed as divergences'
         self.classifier = classifier
         self.val_split = val_split
         self.divergence = divergence
@@ -118,7 +118,6 @@ class KDEy(AggregativeProbabilisticQuantifier):
         self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
             data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
         )
-        print('classifier fit done')
 
         if self.bandwidth == 'auto':
             self.bandwidth = self.search_bandwidth_maxlikelihood(posteriors, y)
@@ -126,21 +125,22 @@ class KDEy(AggregativeProbabilisticQuantifier):
         self.val_densities = [self.get_kde_function(posteriors[y == cat]) for cat in range(data.n_classes)]
         self.val_posteriors = posteriors
 
-        if self.target == 'min_divergence_depr':
+        if self.target == 'min_divergence_uniform':
             self.samples = qp.functional.uniform_prevalence_sampling(n_classes=data.n_classes, size=self.montecarlo_trials)
             self.sample_densities = [self.pdf(kde_i, self.samples) for kde_i in self.val_densities]
-        if self.target == 'min_divergence':
+        elif self.target == 'min_divergence':
             self.class_samples = [kde_i.sample(self.montecarlo_trials, random_state=self.random_state) for kde_i in self.val_densities]
             self.class_sample_densities = {}
             for ci, samples_i in enumerate(self.class_samples):
                 self.class_sample_densities[ci] = np.asarray([self.pdf(kde_j, samples_i) for kde_j in self.val_densities]).T
 
-        print('kde fit done')
         return self
 
     def aggregate(self, posteriors: np.ndarray):
         if self.target == 'min_divergence':
             return self._target_divergence(posteriors)
+        elif self.target == 'min_divergence_uniform':
+            return self._target_divergence_uniform(posteriors)
         elif self.target == 'max_likelihood':
             return self._target_likelihood(posteriors)
         else:
@@ -170,6 +170,42 @@ class KDEy(AggregativeProbabilisticQuantifier):
         # r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
         # return r.x
 
+    def _target_divergence_uniform(self, posteriors):
+        # in this variant we evaluate the divergence using a Monte Carlo approach
+        n_classes = len(self.val_densities)
+
+        test_kde = self.get_kde_function(posteriors)
+        test_likelihood = self.pdf(test_kde, self.samples)
+
+        def f_squared_hellinger(t):
+            return (np.sqrt(t) - 1)**2
+
+        def f_jensen_shannon(t):
+            return -(t+1)*np.log((t+1)/2) + t*np.log(t)
+
+        def fdivergence(pi, qi, f, eps=1e-10):
+            spi = pi+eps
+            sqi = qi+eps
+            return np.mean(f(spi/sqi)*sqi)
+
+        if self.divergence.lower() == 'hd':
+            f = f_squared_hellinger
+        elif self.divergence.lower() == 'js':
+            f = f_jensen_shannon
+
+        def match(prev):
+            val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, self.sample_densities))
+            return fdivergence(val_likelihood, test_likelihood, f)
+
+        # the initial point is set as the uniform distribution
+        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+
+        # solutions are bounded to those contained in the unit-simplex
+        bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
+        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
+        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+        return r.x
+
     def _target_divergence(self, posteriors):
         # in this variant we evaluate the divergence using a Montecarlo approach
         n_classes = len(self.val_densities)
@@ -184,6 +220,18 @@ class KDEy(AggregativeProbabilisticQuantifier):
             smooth_qi = qi+eps
             return np.mean(np.log(smooth_pi / smooth_qi))
 
+        def squared_hellinger(pi, qi, eps=1e-8):
+            smooth_pi = pi + eps
+            smooth_qi = qi + eps
+            return np.mean((np.sqrt(smooth_pi/smooth_qi)-1)**2)
+
+        # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
+        if self.divergence.lower() == 'kld':
+            fdivergence = kld_monte
+        elif self.divergence.lower() == 'hd':
+            fdivergence = squared_hellinger
+
+
         def match(prev):
             # choose the samples according to the prevalence vector
             # e.g., prev = [0.5, 0.3, 0.2] will draw 50% from KDE_0, 30% from KDE_1, and 20% from KDE_2
@@ -202,7 +250,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
             test_likelihood = np.concatenate(
                 [samples_i[:num_i] for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class)]
             )
-            return kld_monte(val_likelihood, test_likelihood)
+            return fdivergence(val_likelihood, test_likelihood)
 
         # the initial point is set as the uniform distribution
         uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
@@ -246,4 +294,5 @@ class KDEy(AggregativeProbabilisticQuantifier):
         #print('searching for alpha')
         r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
         #print('[optimization ended]')
-        return r.x
\ No newline at end of file
+        return r.x
+
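
The new min_divergence_uniform target estimates the f-divergence D_f(p_alpha || q) = integral of q(x) f(p_alpha(x)/q(x)) dx between the class-conditional mixture p_alpha and the test KDE q by averaging q(x) f(p_alpha(x)/q(x)) over points drawn uniformly from the probability simplex (the constant simplex volume is dropped, since it does not change the minimizer). A self-contained sketch of that estimator, with synthetic density values standing in for the precomputed KDE evaluations (all names and numbers below are illustrative, not part of the patch):

    import numpy as np
    from scipy import optimize

    rng = np.random.default_rng(0)

    # stand-ins for what KDEy precomputes: each class-conditional KDE and the test KDE
    # evaluated at the same uniform simplex samples
    n_samples, n_classes = 5000, 3
    class_densities = rng.gamma(2.0, 1.0, size=(n_classes, n_samples))  # p_i(x_j), synthetic
    test_density = rng.gamma(2.0, 1.0, size=n_samples)                  # q(x_j), synthetic

    def f_squared_hellinger(t):
        return (np.sqrt(t) - 1) ** 2

    def mc_f_divergence(p, q, f, eps=1e-10):
        # Monte Carlo estimate of the f-divergence between densities p and q,
        # both evaluated at uniform samples (up to the constant simplex volume)
        p, q = p + eps, q + eps
        return np.mean(f(p / q) * q)

    def objective(prev):
        mixture = prev @ class_densities  # p_alpha(x_j) = sum_i alpha_i * p_i(x_j)
        return mc_f_divergence(mixture, test_density, f_squared_hellinger)

    x0 = np.full(n_classes, 1 / n_classes)
    bounds = tuple((0, 1) for _ in range(n_classes))
    constraints = ({'type': 'eq', 'fun': lambda x: 1 - x.sum()},)
    result = optimize.minimize(objective, x0=x0, method='SLSQP', bounds=bounds, constraints=constraints)
    print(result.x)  # estimated prevalence vector
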
diff --git a/distribution_matching/todo.txt b/distribution_matching/todo.txt
index 4c37e45..d5747fd 100644
--- a/distribution_matching/todo.txt
+++ b/distribution_matching/todo.txt
@@ -1,49 +1,21 @@
-Key idea:
-KDE can be used to generate 2 distributions (one is a mixture model of KDEs fit on the training data, conditioned on each class,
-and the other is a KDE fit on the test data), between which a divergence is then computed (the objective to minimize). Another option is
-to generate a single distribution (the training mixture model) and take the likelihood of the test points as the objective
-to maximize.
+1.- I am not sure whether it would be easier to take r=uniform instead of r=mixture model; it simplifies the sampling a lot and might even produce less error
+2.- For now I have KLD and HD:
+    - for KLD I have not figured out whether I need to add the -x + y term
+3.- Can the topsoe distance be expressed as an f-divergence?
+    The topsoe seems to be 2 times the jensen-shannon divergence, i.e.,
+    topsoe(p,q) = kld(p|m) + kld(q|m), with m = (p+q)/2
+4.- Can the Wasserstein distance be expressed as an f-divergence?
+5.- In general, how do these relate to "distances"?
 
-- take a look at the hyperparameters
-- make some plots
-- study the case in which the target is to minimize a divergence. Possibilities:
-   - evaluate the test points only
-   - evaluate an APP over the simplex?
-   - evaluate a UPP over the simplex? (=Monte Carlo)
-   - which divergences? HD, topsoe, L1?
-- I am also not evaluating in kfcv mode, I think...
 
-1) produce lequa-kfcv and all the kfcv variants that may make sense for tweets
 2) implement the auto mode
     - internal optimization for likelihood [none seems to work well]
     - over everything (e.g., the whole training set)?
     - independently for each labelled set? (e.g., positives, negatives, neutrals, and test)
     - optimization as a GridSearchQ parameter
 6) optimize the kernel? optimize the distance?
-7) sklearn KDE or statsmodels multivariate KDE? also check what this is (it seems to give P(Y|X), so it might
-  remove the need for the classifier?):
-  https://www.statsmodels.org/dev/_modules/statsmodels/nonparametric/kernel_density.html#KDEMultivariateConditional
-8) drop the last dimension in sklearn too? I do not see why
-9) optimize for RAE instead of AE? It does not work well...
 10) Define a classifier that returns, for each class, a posterior computed as the likelihood under the class-conditional KDE
    divided by the likelihood under all the classes (as Juanjo proposes) and plug it into EMD. Do the opposite:
    re-calibrate with EMD and plug it into KDEy
 11) KDEx?
-12) Dirichlet (the DIR method) should be fixed and its results reported...
-13) Statistical tests.
-Notes:
-I am trying to replace the max_likelihood target with a min_divergence one:
-- since the divergence between two KDEs is now over a continuous space, it is not easy to obtain. I am trying
-  an evaluation on test, but the problem is that it is overconfident with respect to the one obtained on test.
-  Another option is a Monte Carlo approach, which is what I am trying now. For this experiment I removed the model selection
-  of the classifier, keeping only the selection of the bandwidth, to speed things up. The KDE-nomonte results are a
-  max_likelihood under equal conditions (bandwidth only), KDE-monte1 is a Monte Carlo with HD at 1000 points, and KDE-monte2
-  is the same but with 5000 points; both work badly. I am going to delete KDE-monte1 and KDE-monte2.
-  Now I am trying KDE-monte3, the same but with an L2 as the
-  divergence. It looks much closer to KDE-nomonte (but it is still somewhat worse)
-   - try with more points (KDE-monte4 uses 5000 points)
-   - topsoe should be tried (KDE-monte5)
-   - try with LR optimization (KDE-monte6, and with kfcv)
-   - try with L1 instead of L2 (KDE-monte7 with 5000 points and without LR)
-   - maybe the L2, which works well, should be tried in the min_divergence that evaluated on test, or test+train
\ No newline at end of file
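
On note 3 above: the generator used as f_jensen_shannon in method_kdey.py, f(t) = t*log(t) - (t+1)*log((t+1)/2), does reproduce kld(p|m) + kld(q|m) with m = (p+q)/2, i.e. the topsoe distance, which is twice the Jensen-Shannon divergence. A small numerical check on discrete distributions (illustrative only):

    import numpy as np

    rng = np.random.default_rng(1)
    p = rng.dirichlet(np.ones(10))
    q = rng.dirichlet(np.ones(10))

    def kld(a, b):
        return np.sum(a * np.log(a / b))

    def f_js(t):  # same generator as f_jensen_shannon in method_kdey.py
        return -(t + 1) * np.log((t + 1) / 2) + t * np.log(t)

    m = (p + q) / 2
    topsoe = kld(p, m) + kld(q, m)    # = 2 * Jensen-Shannon divergence
    via_f = np.sum(q * f_js(p / q))   # discrete f-divergence with generator f_js

    print(np.isclose(via_f, topsoe))  # True
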
diff --git a/distribution_matching/tweets_experiments.py b/distribution_matching/tweets_experiments.py
index 16b18de..3f8230c 100644
--- a/distribution_matching/tweets_experiments.py
+++ b/distribution_matching/tweets_experiments.py
@@ -15,70 +15,71 @@ if __name__ == '__main__':
     qp.environ['N_JOBS'] = -1
     n_bags_val = 250
     n_bags_test = 1000
-    optim = 'mae'
-    result_dir = f'results/tweet/{optim}'
+    for optim in ['mae', 'mrae']:
 
-    os.makedirs(result_dir, exist_ok=True)
+        result_dir = f'results/tweet/{optim}'
 
-    for method in METHODS:
-
-        print('Init method', method)
+        os.makedirs(result_dir, exist_ok=True)
 
-        global_result_path = f'{result_dir}/{method}'
-
-        if not os.path.exists(global_result_path+'.csv'):
-            with open(global_result_path+'.csv', 'wt') as csv:
-                csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
+        for method in METHODS:
 
-        with open(global_result_path+'.csv', 'at') as csv:
-            # four semeval dataset share the training, so it is useless to optimize hyperparameters four times;
-            # this variable controls that the mod sel has already been done, and skip this otherwise
-            semeval_trained = False
+            print('Init method', method)
 
-            for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
-                print('init', dataset)
+            global_result_path = f'{result_dir}/{method}'
 
-                local_result_path = global_result_path + '_' + dataset
-                if os.path.exists(local_result_path+'.dataframe'):
-                    print(f'result file {local_result_path}.dataframe already exist; skipping')
-                    continue
-
-                with qp.util.temp_seed(SEED):
+            if not os.path.exists(global_result_path+'.csv'):
+                with open(global_result_path+'.csv', 'wt') as csv:
+                    csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
 
-                    is_semeval = dataset.startswith('semeval')
+            with open(global_result_path+'.csv', 'at') as csv:
+                # the four semeval datasets share the same training set, so optimizing hyperparameters four times is pointless;
+                # this flag indicates whether model selection has already been done, in which case it is skipped
+                semeval_trained = False
 
-                    if not is_semeval or not semeval_trained:
+                for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
+                    print('init', dataset)
 
-                        param_grid, quantifier = new_method(method)
+                    local_result_path = global_result_path + '_' + dataset
+                    if os.path.exists(local_result_path+'.dataframe'):
+                        print(f'result file {local_result_path}.dataframe already exists; skipping')
+                        continue
 
-                        # model selection
-                        data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)
+                    with qp.util.temp_seed(SEED):
 
-                        protocol = UPP(data.test, repeats=n_bags_val)
-                        modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)
+                        is_semeval = dataset.startswith('semeval')
 
-                        modsel.fit(data.training)
-                        print(f'best params {modsel.best_params_}')
-                        print(f'best score {modsel.best_score_}')
-                        pickle.dump(
-                            (modsel.best_params_, modsel.best_score_,),
-                            open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
+                        if not is_semeval or not semeval_trained:
 
-                        quantifier = modsel.best_model()
+                            param_grid, quantifier = new_method(method)
 
-                        if is_semeval:
-                            semeval_trained = True
-
-                        else:
-                            print(f'model selection for {dataset} already done; skipping')
+                            # model selection
+                            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)
 
-                data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
-                quantifier.fit(data.training)
-                protocol = UPP(data.test, repeats=n_bags_test)
-                report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
-                report.to_csv(f'{local_result_path}.dataframe')
-                means = report.mean()
-                csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
-                csv.flush()
+                            protocol = UPP(data.test, repeats=n_bags_val)
+                            modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)
 
-    show_results(global_result_path)
+                            modsel.fit(data.training)
+                            print(f'best params {modsel.best_params_}')
+                            print(f'best score {modsel.best_score_}')
+                            pickle.dump(
+                                (modsel.best_params_, modsel.best_score_,),
+                                open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
+
+                            quantifier = modsel.best_model()
+
+                            if is_semeval:
+                                semeval_trained = True
+
+                        else:
+                            print(f'model selection for {dataset} already done; skipping')
+
+                    data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
+                    quantifier.fit(data.training)
+                    protocol = UPP(data.test, repeats=n_bags_test)
+                    report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
+                    report.to_csv(f'{local_result_path}.dataframe')
+                    means = report.mean()
+                    csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
+                    csv.flush()
+
+        show_results(global_result_path)
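
The pieces above can be exercised together outside the experiment scripts. A minimal sketch mirroring the calls used in binary_experiments.py, with one of the experimental configurations registered in commons.new_method (the KDEy import path, the 'yeast' dataset name, and the SAMPLE_SIZE/repeats values are assumptions for illustration, not taken from the patch):

    import quapy as qp
    from quapy.protocol import UPP
    from sklearn.linear_model import LogisticRegression
    from distribution_matching.method_kdey import KDEy  # assumed import path within this repo

    qp.environ['SAMPLE_SIZE'] = 100  # assumed value; required by the error metrics

    # any UCI binary dataset handled by binary_experiments.py would do here
    data = qp.datasets.fetch_UCIDataset('yeast')
    train, test = data.train_test

    # mirrors the 'KDEy-DMjs' configuration: Monte Carlo minimization of the JS divergence
    # over densities evaluated at uniform simplex samples
    quantifier = KDEy(LogisticRegression(), target='min_divergence_uniform', divergence='JS',
                      montecarlo_trials=5000, val_split=10)
    quantifier.fit(train)

    report = qp.evaluation.evaluation_report(quantifier, protocol=UPP(test, repeats=100),
                                             error_metrics=['mae', 'mrae', 'kld'], verbose=True)
    print(report.mean())
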