diff --git a/plot_example.py b/plot_example.py
deleted file mode 100644
index 346455e..0000000
--- a/plot_example.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from sklearn.model_selection import GridSearchCV
-import numpy as np
-import quapy as qp
-from sklearn.linear_model import LogisticRegression
-
-sample_size = 500
-qp.environ['SAMPLE_SIZE'] = sample_size
-
-
-
-def gen_data():
-
-    data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
-
-    models = [
-        qp.method.aggregative.CC,
-        qp.method.aggregative.ACC,
-        qp.method.aggregative.PCC,
-        qp.method.aggregative.PACC,
-        qp.method.aggregative.HDy,
-        qp.method.aggregative.EMQ,
-        qp.method.meta.ECC,
-        qp.method.meta.EACC,
-        qp.method.meta.EHDy,
-    ]
-
-    method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
-    for Quantifier in models:
-        print(f'training {Quantifier.__name__}')
-        lr = LogisticRegression(max_iter=1000, class_weight='balanced')
-        # lr = GridSearchCV(lr, param_grid={'C':np.logspace(-3,3,7)}, n_jobs=-1)
-        model = Quantifier(lr).fit(data.training)
-        true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(
-            model, data.test, sample_size, n_repetitions=20, n_prevpoints=11)
-
-        method_names.append(Quantifier.__name__)
-        true_prevs.append(true_prev)
-        estim_prevs.append(estim_prev)
-        tr_prevs.append(data.training.prevalence())
-
-    return method_names, true_prevs, estim_prevs, tr_prevs
-
-method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('./plots/plot_data.pkl', gen_data)
-
-qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=11, savepath='./plots/err_drift.png')
-qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots/bin_diag.png')
-qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
-qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=11, savepath='./plots/bin_bias_bin.png')
diff --git a/test.py b/test.py
deleted file mode 100644
index 3d664b3..0000000
--- a/test.py
+++ /dev/null
@@ -1,222 +0,0 @@
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import GridSearchCV
-from sklearn.svm import LinearSVC, LinearSVR
-import quapy as qp
-import quapy.functional as F
-import sys
-import numpy as np
-
-from NewMethods.methods import AveragePoolQuantification
-from classification.methods import PCALR
-from data import Dataset
-from method.meta import EPACC
-from quapy.model_selection import GridSearchQ
-from tqdm import tqdm
-import pandas as pd
-
-sample_size=100
-qp.environ['SAMPLE_SIZE'] = sample_size
-
-np.random.seed(0)
-
-nfolds=5
-nrepeats=1
-
-df = pd.DataFrame(columns=['dataset', 'method', 'mse'])
-for datasetname in qp.datasets.UCI_DATASETS:
-    collection = qp.datasets.fetch_UCILabelledCollection(datasetname, verbose=False)
-    scores = []
-    pbar = tqdm(Dataset.kFCV(collection, nfolds=nfolds, nrepeats=nrepeats), total=nfolds*nrepeats)
-    for data in pbar:
-        pbar.set_description(f'{data.name}')
-        # learner = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid={'C': np.logspace(-3,3,7)}, n_jobs=-1)
-        learner = LogisticRegression(class_weight='balanced')
-        # model = qp.method.aggregative.CC(learner)
-        model = qp.method.meta.EHDy(learner, size=30, red_size=15, verbose=False)
-        model.fit(data.training)
-        err = qp.evaluation.artificial_sampling_eval(model, data.test, sample_size, n_prevpoints=101, n_jobs=-1,
-                                                     error_metric='mse', verbose=False)
-        scores.append(err)
-
-    score = np.mean(scores)
-    df = df.append({
-        'dataset': datasetname,
-        'method': model.__class__.__name__,
-        'mse': score
-    }, ignore_index=True)
-    print(df)
-
-sys.exit(0)
-
-
-
-#param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
-param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']}
-max_evaluations = 500
-
-sample_size = qp.environ['SAMPLE_SIZE']
-binary = False
-svmperf_home = './svm_perf_quantification'
-
-if binary:
-    #dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
-    dataset = qp.datasets.fetch_UCIDataset('german', verbose=True)
-    #qp.data.preprocessing.index(dataset, inplace=True)
-
-else:
-    dataset = qp.datasets.fetch_twitter('gasp', for_model_selection=True, min_df=5, pickle=True)
-    #dataset.training = dataset.training.sampling(sample_size, 0.2, 0.5, 0.3)
-
-print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}')
-
-
-# training a quantifier
-# learner = LogisticRegression(max_iter=1000)
-#model = qp.method.aggregative.ClassifyAndCount(learner)
-# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
-# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
-# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
-# model = qp.method.aggregative.HellingerDistanceY(learner)
-# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
-# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
-# model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
-
-#learner = PCALR()
-#learner = NeuralClassifierTrainer(CNNnet(dataset.vocabulary_size, dataset.n_classes))
-#print(learner.get_params())
-#model = qp.method.meta.QuaNet(learner, sample_size, device='cpu')
-
-#learner = GridSearchCV(LogisticRegression(max_iter=1000), param_grid=param_grid, n_jobs=-1, verbose=1)
-#learner = LogisticRegression(max_iter=1000)
-# model = qp.method.aggregative.ClassifyAndCount(learner)
-
-param_mod_sel = {
-    'sample_size': 100,
-    'n_prevpoints': 21,
-    'n_repetitions': 5,
-    'verbose': False
-}
-common = {
-    'max_sample_size': 50,
-    'n_jobs': -1,
-    'param_grid': {'C': np.logspace(0,2,2), 'class_weight': ['balanced']},
-    'param_mod_sel': param_mod_sel,
-    'val_split': 0.4,
-    'min_pos': 10,
-    'size':6,
-    'red_size':3
-}
-
-# hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
-# will be skipped (by setting hyperparameters to None)
-model = EPACC(LogisticRegression(max_iter=100), optim='mrae', policy='mrae', **common)
-
-"""
-Problems:
-- The interface is very clunky; you need to know practically every detail, so it saves nothing compared to creating
-  an object with other objects nested inside it.
-- fit generates the random prevalences, which means that after model selection a new fit throws away all the work
-  already done.
-- The fit of a GridSearchQ holds a best_estimator internally, but after model selection, calling fit again on this
-  object does not simply re-train the model with the best parameters; instead it starts a new search
-  in grid-search mode.
-- Possible solution (not viable): run model selection directly on the final benchmark, although this would mean the hyper-
-  parameters are searched on a different set than the rest of the models...
-- Possible solution:
-  - Choose the prevalences in init
-
-
-- Problem: the val_split parameter is very ambiguous throughout the framework. For example, in EPACC it could be a float that,
-  in the case of a GridSearchQ, could refer to the validation split for the hyperparameters or to the split that PACC
-  uses to find its parameters...
-"""
-
-# regressor = LinearSVR(max_iter=10000)
-# param_grid = {'C': np.logspace(-1,3,5)}
-# model = AveragePoolQuantification(regressor, sample_size, trials=5000, n_components=500, zscore=False)
-
-# model = qp.method.meta.EHDy(learner, param_grid=param_grid, optim='mae',
-#                             sample_size=sample_size, eval_budget=max_evaluations//10, n_jobs=-1)
-#model = qp.method.aggregative.ClassifyAndCount(learner)
-
-# model = qp.method.meta.QuaNet(PCALR(n_components=100, max_iter=1000),
-#                               sample_size=100,
-#                               patience=10,
-#                               tr_iter_per_poch=500, va_iter_per_poch=100, #lstm_nlayers=2, lstm_hidden_size=64,
-#                               ff_layers=[500, 250, 50],
-                              # checkpointdir='./checkpoint', device='cuda')
-
-if qp.isbinary(model) and not qp.isbinary(dataset):
-    model = qp.method.aggregative.OneVsAll(model)
-
-
-# Model fit and Evaluation on the test data
-# ----------------------------------------------------------------------------
-
-print(f'fitting model {model.__class__.__name__}')
-#train, val = dataset.training.split_stratified(0.6)
-#model.fit(train, val_split=val)
-qp.SAMPLE=1
-qp.environ['SAMPLE_SIZE']=2
-model.fit(dataset.training)
-
-
-
-
-
-# estimating class prevalences
-# print('quantifying')
-# prevalences_estim = model.quantify(dataset.test.instances)
-# prevalences_true = dataset.test.prevalence()
-#
-# evaluation (one single prediction)
-# error = qp.error.mae(prevalences_true, prevalences_estim)
-#
-# print(f'Evaluation in test (1 eval)')
-# print(f'true prevalence {F.strprev(prevalences_true)}')
-# print(f'estim prevalence {F.strprev(prevalences_estim)}')
-# print(f'mae={error:.3f}')
-
-
-# Model fit and Evaluation according to the artificial sampling protocol
-# ----------------------------------------------------------------------------
-
-n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
-n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
-print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that\n'
-      f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n'
-      f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')
-
-true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, sample_size, n_prevpoints)
-
-#qp.error.SAMPLE_SIZE = sample_size
-print(f'Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
-for error in qp.error.QUANTIFICATION_ERROR:
-    score = error(true_prev, estim_prev)
-    print(f'{error.__name__}={score:.5f}')
-
-sys.exit(0)
-# Model selection and Evaluation according to the artificial sampling protocol
-# ----------------------------------------------------------------------------
-
-model_selection = GridSearchQ(model,
-                              param_grid=param_grid,
-                              sample_size=sample_size,
-                              eval_budget=max_evaluations//10,
-                              error='mae',
-                              refit=True,
-                              verbose=True,
-                              timeout=60*60)
-
-model = model_selection.fit(dataset.training, val_split=0.3)
-#model = model_selection.fit(train, validation=val)
-print(f'Model selection: best_params = {model_selection.best_params_}')
-print(f'param scores:')
-for params, score in model_selection.param_scores_.items():
-    print(f'\t{params}: {score:.5f}')
-
-true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, sample_size, n_prevpoints)
-
-print(f'After model selection: Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
-for error in qp.error.QUANTIFICATION_ERROR:
-    score = error(true_prev, estim_prev)
-    print(f'{error.__name__}={score:.5f}')
\ No newline at end of file