diff --git a/LeQua2022/baselines_T2.py b/LeQua2022/_depr_baselines_T2.py
similarity index 52%
rename from LeQua2022/baselines_T2.py
rename to LeQua2022/_depr_baselines_T2.py
index e119aa4..2b02e7f 100644
--- a/LeQua2022/baselines_T2.py
+++ b/LeQua2022/_depr_baselines_T2.py
@@ -22,10 +22,10 @@ import constants
 
 def baselines():
     yield CC(LR(n_jobs=-1)), "CC"
-    yield ACC(LR(n_jobs=-1)), "ACC"
-    yield PCC(LR(n_jobs=-1)), "PCC"
-    yield PACC(LR(n_jobs=-1)), "PACC"
-    yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
+    # yield ACC(LR(n_jobs=-1)), "ACC"
+    # yield PCC(LR(n_jobs=-1)), "PCC"
+    # yield PACC(LR(n_jobs=-1)), "PACC"
+    # yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
     # yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
     # yield MLPE(), "MLPE"
 
@@ -34,75 +34,15 @@ def main(args):
 
     models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task))
 
-    path_dev_vectors = os.path.join(args.datadir, 'dev_documents')
-    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv')
-    path_train = os.path.join(args.datadir, 'training_documents.txt')
+    path_dev_vectors = os.path.join(args.datadir, 'dev_samples')
+    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.txt')
+    path_train = os.path.join(args.datadir, 'training_data.txt')
 
     qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
 
     train = LabelledCollection.load(path_train, load_raw_documents)
 
-    if args.mode == 'tfidf1':
-        tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True)
-    if args.mode == 'tfidf2':
-        tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))
-    if args.mode == 'tfidf3':
-        tfidf = Pipeline([
-            ('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True)),
-            ('svd', TruncatedSVD(n_components=300))
-        ])
-    if args.mode == 'tfidf4':
-        tfidf = Pipeline([
-            ('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))),
-            ('svd', TruncatedSVD(n_components=300))
-        ])
-    if args.mode == 'glove1':
-        tfidf = Pipeline([
-            ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
-            ('zscore', StandardScaler())
-        ])
-    if args.mode == 'glove2':
-        tfidf = WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')
-    if args.mode == 'glove3':
-        vect = TfidfVectorizer(min_df=5, sublinear_tf=True)
-        tfidf = Pipeline([
-            ('tfidf', vect),
-            ('embedding', TfidfWordEmbeddingTransformer(
-                wordset_name='glove',
-                features_call=vect.get_feature_names_out,
-                path='/mnt/1T/Datasets/GloVe')),
-            ('zscore', StandardScaler())
-        ])
-    if args.mode == 'glove4':
-        vect = TfidfVectorizer(min_df=5, sublinear_tf=True)
-        tfidf = Pipeline([
-            ('tfidf', vect),
-            ('embedding', TfidfWordEmbeddingTransformer(
-                wordset_name='glove',
-                features_call=vect.get_feature_names_out,
-                path='/mnt/1T/Datasets/GloVe'))
-        ])
-    if args.mode == 'wce1':
-        tfidf = WordClassEmbeddingsTransformer()
-    if args.mode == 'wce2':
-        glove = Pipeline([
-            ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
-            ('zscore', StandardScaler())
-        ])
-        wce = WordClassEmbeddingsTransformer()
-        tfidf = ConcatenateEmbeddingsTransformer([glove, wce])
-    if args.mode == 'wce3':
-        glove = Pipeline([
-            ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
-            ('zscore', StandardScaler())
-        ])
-        wce = WordClassEmbeddingsTransformer()
-        tfidf = Pipeline([
-            ('glove-wce', ConcatenateEmbeddingsTransformer([glove, wce])),
-            ('svd', TruncatedSVD(n_components=300))
-            ])
-    target_metric = qp.error.mrae
-
+    tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))
     train.instances = tfidf.fit_transform(*train.Xy)
 
     print(f'number of classes: {len(train.classes_)}')
@@ -110,18 +50,18 @@ def main(args):
     print(f'training prevalence: {F.strprev(train.prevalence())}')
     print(f'training matrix shape: {train.instances.shape}')
 
-    param_grid = {
-        'C': np.logspace(-3, 3, 7),
-        'class_weight': ['balanced', None]
-    }
-
     # param_grid = {
-    #     'C': [1],
-    #     'class_weight': ['balanced']
+    #     'C': np.logspace(-3, 3, 7),
+    #     'class_weight': ['balanced', None]
     # }
 
+    param_grid = {
+        'C': [1],
+        'class_weight': ['balanced']
+    }
+
     def gen_samples():
-        return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False,
+        return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
                                 load_fn=load_raw_unlabelled_documents, vectorizer=tfidf)
 
     outs = []
@@ -132,7 +72,7 @@ def main(args):
             param_grid,
             sample_size=None,
             protocol='gen',
-            error=target_metric,  #qp.error.mae,
+            error=qp.error.mrae,
             refit=False,
             verbose=True
         ).fit(train, gen_samples)
@@ -144,8 +84,6 @@ def main(args):
         print(f'saving model in {model_path}')
         pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
 
-    print(tfidf)
-    print(args.mode)
     print(outs)
-    with open(f'{args.mode}.{args.task}.txt', 'wt') as foo:
+    with open(f'{args.task}.txt', 'wt') as foo:
         for line in outs:
@@ -157,26 +95,23 @@ if __name__ == '__main__':
     parser.add_argument('task', metavar='TASK', type=str, choices=['T2A', 'T2B'],
                         help='Task name (T2A, T2B)')
     parser.add_argument('datadir', metavar='DATA-PATH', type=str,
-                        help='Path of the directory containing "dev_prevalences.csv", "training_documents.txt", and '
-                             'the directory "dev_documents"')
+                        help='Path of the directory containing "dev_prevalences.txt", "training_data.txt", and '
+                             'the directory "dev_samples"')
     parser.add_argument('modeldir', metavar='MODEL-PATH', type=str,
                         help='Path where to save the models. '
                              'A subdirectory named <task> will be automatically created.')
-    parser.add_argument('mode', metavar='PREPROCESSMODE', type=str,
-                        help='modality of preprocessing')
     args = parser.parse_args()
 
     if not os.path.exists(args.datadir):
         raise FileNotFoundError(f'path {args.datadir} does not exist')
     if not os.path.isdir(args.datadir):
         raise ValueError(f'path {args.datadir} is not a valid directory')
-    if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.csv")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.csv" file')
-    if not os.path.exists(os.path.join(args.datadir, "training_documents.txt")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "training_documents.txt" file')
-    if not os.path.exists(os.path.join(args.datadir, "dev_documents")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_vectors" folder')
+    if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.txt")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.txt" file')
+    if not os.path.exists(os.path.join(args.datadir, "training_data.txt")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "training_data.txt" file')
+    if not os.path.exists(os.path.join(args.datadir, "dev_samples")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_samples" folder')
 
     main(args)
 
-    # print('WITHOUT MODEL SELECTION')
diff --git a/LeQua2022/baselines_T1.py b/LeQua2022/baselines.py
similarity index 62%
rename from LeQua2022/baselines_T1.py
rename to LeQua2022/baselines.py
index dd548c2..42ead5e 100644
--- a/LeQua2022/baselines_T1.py
+++ b/LeQua2022/baselines.py
@@ -1,5 +1,7 @@
 import argparse
 import pickle
+
+from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression as LR
 from quapy.method.aggregative import *
 from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
@@ -16,8 +18,8 @@ def baselines():
     yield CC(LR(n_jobs=-1)), "CC"
     # yield ACC(LR(n_jobs=-1)), "ACC"
     # yield PCC(LR(n_jobs=-1)), "PCC"
-    # yield PACC(LR(n_jobs=-1)), "PACC"
-    # yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
+    yield PACC(LR(n_jobs=-1)), "PACC"
+    yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
     # yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
     # yield MLPE(), "MLPE"
 
@@ -26,35 +28,40 @@ def main(args):
 
     models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task))
 
-    path_dev_vectors = os.path.join(args.datadir, 'dev_vectors')
-    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv')
-    path_train = os.path.join(args.datadir, 'training_vectors.csv')
+    path_dev_vectors = os.path.join(args.datadir, 'dev_samples')
+    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.txt')
+    path_train = os.path.join(args.datadir, 'training_data.txt')
 
     qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
 
-    train = LabelledCollection.load(path_train, load_vector_documents)
-    nF = train.instances.shape[1]
+    if args.task in {'T1A', 'T1B'}:
+        train = LabelledCollection.load(path_train, load_vector_documents)
+
+        def gen_samples():
+            return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, load_fn=load_vector_documents)
+    else:
+        train = LabelledCollection.load(path_train, load_raw_documents)
+        tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1, 2))
+        train.instances = tfidf.fit_transform(*train.Xy)
+
+        def gen_samples():
+            return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
+                                    load_fn=load_raw_unlabelled_documents, vectorizer=tfidf)
 
     print(f'number of classes: {len(train.classes_)}')
     print(f'number of training documents: {len(train)}')
     print(f'training prevalence: {F.strprev(train.prevalence())}')
     print(f'training matrix shape: {train.instances.shape}')
 
-    # param_grid = {
-    #     'C': np.logspace(-3, 3, 7),
-    #     'class_weight': ['balanced', None]
-    # }
-
     param_grid = {
-        'C': [0.01],
-        'class_weight': ['balanced']
+        'C': np.logspace(-3, 3, 7),
+        'class_weight': ['balanced', None]
     }
-    target_metric = qp.error.mrae
-
-    def gen_samples():
-        return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False,
-                                load_fn=load_vector_documents, ext='csv')
 
+    # param_grid = {
+    #     'C': [0.01, 0.1, 1],
+    #     'class_weight': ['balanced']
+    # }
 
     for quantifier, q_name in baselines():
         print(f'{q_name}: Model selection')
@@ -63,7 +70,7 @@ def main(args):
             param_grid,
             sample_size=None,
             protocol='gen',
-            error=target_metric,  #qp.error.mae,
+            error=qp.error.mrae,
             refit=False,
             verbose=True
         ).fit(train, gen_samples)
@@ -76,12 +83,12 @@ def main(args):
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 Task T1A/T1B baselines')
-    parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B'],
-                        help='Task name (T1A, T1B)')
+    parser = argparse.ArgumentParser(description='LeQua2022 baselines')
+    parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'],
+                        help='Task name (T1A, T1B, T2A, T2B)')
     parser.add_argument('datadir', metavar='DATA-PATH', type=str,
-                        help='Path of the directory containing "dev_prevalences.csv", "training_vectors.txt", and '
-                             'the directory "dev_vectors"')
+                        help='Path of the directory containing "dev_prevalences.txt", "training_data.txt", and '
+                             'the directory "dev_samples"')
     parser.add_argument('modeldir', metavar='MODEL-PATH', type=str,
                         help='Path where to save the models. '
                              'A subdirectory named <task> will be automatically created.')
@@ -91,11 +98,11 @@ if __name__ == '__main__':
         raise FileNotFoundError(f'path {args.datadir} does not exist')
     if not os.path.isdir(args.datadir):
         raise ValueError(f'path {args.datadir} is not a valid directory')
-    if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.csv")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.csv" file')
-    if not os.path.exists(os.path.join(args.datadir, "training_vectors.csv")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "training_vectors.csv" file')
-    if not os.path.exists(os.path.join(args.datadir, "dev_vectors")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_vectors" folder')
+    if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.txt")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.txt" file')
+    if not os.path.exists(os.path.join(args.datadir, "training_data.txt")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "training_data.txt" file')
+    if not os.path.exists(os.path.join(args.datadir, "dev_samples")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_samples" folder')
 
     main(args)
diff --git a/LeQua2022/constants.py b/LeQua2022/constants.py
index 7a664a9..2c4bc77 100644
--- a/LeQua2022/constants.py
+++ b/LeQua2022/constants.py
@@ -2,18 +2,15 @@ DEV_SAMPLES = 1000
 TEST_SAMPLES = 5000
 
 TXA_SAMPLE_SIZE = 250
-TXB_SAMPLE_SIZE = 250
-
-T1A_SAMPLE_SIZE = 250
-T1B_SAMPLE_SIZE = 1000
-T2A_SAMPLE_SIZE = 250
-T2B_SAMPLE_SIZE = 1000
+TXB_SAMPLE_SIZE = 1000
 
 SAMPLE_SIZE={
-    'T1A': T1A_SAMPLE_SIZE,
-    'T1B': T1B_SAMPLE_SIZE,
-    'T2A': T2A_SAMPLE_SIZE,
-    'T2B': T2B_SAMPLE_SIZE
+    'TXA': TXA_SAMPLE_SIZE,
+    'TXB': TXB_SAMPLE_SIZE,
+    'T1A': TXA_SAMPLE_SIZE,
+    'T1B': TXB_SAMPLE_SIZE,
+    'T2A': TXA_SAMPLE_SIZE,
+    'T2B': TXB_SAMPLE_SIZE
 }
 
 ERROR_TOL = 1E-3
diff --git a/LeQua2022/data.py b/LeQua2022/data.py
index e581096..6d09db9 100644
--- a/LeQua2022/data.py
+++ b/LeQua2022/data.py
@@ -34,53 +34,37 @@ def load_raw_unlabelled_documents(path, vectorizer=None):
     return documents, None
 
 
-# def load_vector_documents(path, nF=None):
-#     X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF, zero_based=True)
-#     y = y.astype(int)
-#     return X, y
-
 def load_vector_documents(path):
-    D = pd.read_csv(path).to_numpy(dtype=np.float)
+    D = pd.read_csv(path).to_numpy(dtype=float)
     labelled = D.shape[1] == 301
     if labelled:
-        X, y = D[:,:300], D[:,-1].astype(np.int).flatten()
+        X, y = D[:,1:], D[:,0].astype(int).flatten()
     else:
         X, y = D, None
     return X, y
 
 
-def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, ext:str, load_fn, **load_kwargs):
+def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, load_fn, **load_kwargs):
     true_prevs = ResultSubmission.load(ground_truth_path)
     for id, prevalence in true_prevs.iterrows():
-        sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs)
+        sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)
         yield (id, sample, prevalence) if return_id else (sample, prevalence)
 
 
-def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, ext:str, load_fn, **load_kwargs):
-    nsamples = len(glob(os.path.join(path_dir, f'*.{ext}')))
+def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn, **load_kwargs):
+    nsamples = len(glob(os.path.join(path_dir, f'*.txt')))
     for id in range(nsamples):
-        sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs)
+        sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)
         yield (id, sample) if return_id else sample
 
 
-def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=True, ext='txt', load_fn=load_vector_documents, **load_kwargs):
+def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=False, load_fn=load_vector_documents, **load_kwargs):
     if ground_truth_path is None:
         # the generator function returns tuples (docid:str, sample:csr_matrix or str)
-        gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, ext, load_fn, **load_kwargs)
+        gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, **load_kwargs)
     else:
         # the generator function returns tuples (docid:str, sample:csr_matrix or str, prevalence:ndarray)
-        gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, ext, load_fn, **load_kwargs)
-    for r in gen_fn:
-        yield r
-
-
-def genSVD_load_samples_T1(load_fn, path_dir:str, nF:int, ground_truth_path:str = None, return_id=True):
-    if ground_truth_path is None:
-        # the generator function returns tuples (filename:str, sample:csr_matrix)
-        gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, nF=nF)
-    else:
-        # the generator function returns tuples (filename:str, sample:csr_matrix, prevalence:ndarray)
-        gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, nF=nF)
+        gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, **load_kwargs)
     for r in gen_fn:
         yield r
 
@@ -214,19 +198,19 @@ def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSub
         raise ValueError(f'these result files are not comparable since the categories are different: '
                          f'true={true_prevs.n_categories} categories vs. '
                          f'predictions={predicted_prevs.n_categories} categories')
-    ae, rae = [], []
+    rae, ae = [], []
     for sample_id, true_prevalence in true_prevs.iterrows():
         pred_prevalence = predicted_prevs.prevalence(sample_id)
-        ae.append(qp.error.ae(true_prevalence, pred_prevalence))
         rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size)))
+        ae.append(qp.error.ae(true_prevalence, pred_prevalence))
 
-    ae = np.asarray(ae)
     rae = np.asarray(rae)
+    ae = np.asarray(ae)
 
     if average:
-        return ae.mean(), rae.mean()
+        return rae.mean(), ae.mean()
     else:
-        return ae, rae
+        return rae, ae
 
 
 
diff --git a/LeQua2022/evaluate.py b/LeQua2022/evaluate.py
index 6095fd9..c21e368 100644
--- a/LeQua2022/evaluate.py
+++ b/LeQua2022/evaluate.py
@@ -8,21 +8,20 @@ LeQua2022 Official evaluation script
 """
 
 def main(args):
-    if args.task in {'T1A', 'T2A'}:
-        qp.environ['SAMPLE_SIZE'] = constants.TXA_SAMPLE_SIZE
-    if args.task in {'T1B', 'T2B'}:
-        qp.environ['SAMPLE_SIZE'] = constants.TXB_SAMPLE_SIZE
+
+    sample_size = constants.SAMPLE_SIZE[args.task]
+
     true_prev = ResultSubmission.load(args.true_prevalences)
     pred_prev = ResultSubmission.load(args.pred_prevalences)
-    mae, mrae = evaluate_submission(true_prev, pred_prev)
-    print(f'MAE: {mae:.4f}')
+
+    mrae, mae = evaluate_submission(true_prev, pred_prev, sample_size)
     print(f'MRAE: {mrae:.4f}')
+    print(f'MAE: {mae:.4f}')
 
     if args.output is not None:
-        qp.util.create_parent_dir(args.output)
         with open(args.output, 'wt') as foo:
-            foo.write(f'MAE: {mae:.4f}\n')
             foo.write(f'MRAE: {mrae:.4f}\n')
+            foo.write(f'MAE: {mae:.4f}\n')
 
 
 if __name__=='__main__':
@@ -37,4 +36,7 @@ if __name__=='__main__':
                         help='Path where to store the evaluation scores')
     args = parser.parse_args()
 
+    if args.output is not None:
+        qp.util.create_parent_dir(args.output)
+
     main(args)
diff --git a/LeQua2022/format_checker.py b/LeQua2022/format_checker.py
index 2bf5cd9..d9ae549 100644
--- a/LeQua2022/format_checker.py
+++ b/LeQua2022/format_checker.py
@@ -1,8 +1,6 @@
 import argparse
-import quapy as qp
-from data import ResultSubmission, evaluate_submission
-import constants
-import os
+from data import ResultSubmission
+
 
 """
 LeQua2022 Official format-checker script 
@@ -13,9 +11,9 @@ def main(args):
         ResultSubmission.check_file_format(args.prevalence_file)
     except Exception as e:
         print(e)
-        print('Format check: not passed')
+        print('Format check: [not passed]')
     else:
-        print('Format check: passed')
+        print('Format check: [passed]')
 
 
 if __name__=='__main__':
diff --git a/LeQua2022/predict.py b/LeQua2022/predict.py
index c02d94b..b014468 100644
--- a/LeQua2022/predict.py
+++ b/LeQua2022/predict.py
@@ -1,7 +1,6 @@
 import argparse
 import quapy as qp
 from data import ResultSubmission
-import constants
 import os
 import pickle
 from tqdm import tqdm
@@ -27,7 +26,7 @@ def main(args):
 
     # predictions
     predictions = ResultSubmission()
-    for sampleid, sample in tqdm(gen_load_samples(args.samples, args.nf), desc='predicting', total=nsamples):
+    for sampleid, sample in tqdm(gen_load_samples(args.samples, return_id=True), desc='predicting', total=nsamples):
         predictions.add(sampleid, model.quantify(sample))
 
     # saving
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 7a8df5c..3cb2392 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -9,111 +9,6 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
 from quapy.functional import artificial_prevalence_sampling, strprev
 
 
-# class Sampling:
-#
-#     @abstractmethod
-#     def load(cls, path: str, loader_func: callable, classes=None): ...
-#
-#     @abstractmethod
-#     @property
-#     def __len__(self): ...
-#
-#     @abstractmethod
-#     @property
-#     def prevalence(self): ...
-#
-#     @abstractmethod
-#     @property
-#     def n_classes(self):
-#
-#     @property
-#     def binary(self):
-#         return self.n_classes == 2
-#
-#     def uniform_sampling_index(self, size):
-#         return np.random.choice(len(self), size, replace=False)
-#
-#     def uniform_sampling(self, size):
-#         unif_index = self.uniform_sampling_index(size)
-#         return self.sampling_from_index(unif_index)
-#
-#     def sampling(self, size, *prevs, shuffle=True):
-#         prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
-#         return self.sampling_from_index(prev_index)
-#
-#     def sampling_from_index(self, index):
-#         documents = self.instances[index]
-#         labels = self.labels[index]
-#         return LabelledCollection(documents, labels, classes_=self.classes_)
-#
-#     def split_stratified(self, train_prop=0.6, random_state=None):
-#         # with temp_seed(42):
-#         tr_docs, te_docs, tr_labels, te_labels = \
-#             train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
-#                              random_state=random_state)
-#         return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
-#
-#     def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
-#         dimensions = self.n_classes
-#         for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
-#             yield self.sampling(sample_size, *prevs)
-#
-#     def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
-#         dimensions = self.n_classes
-#         for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
-#             yield self.sampling_index(sample_size, *prevs)
-#
-#     def natural_sampling_generator(self, sample_size, repeats=100):
-#         for _ in range(repeats):
-#             yield self.uniform_sampling(sample_size)
-#
-#     def natural_sampling_index_generator(self, sample_size, repeats=100):
-#         for _ in range(repeats):
-#             yield self.uniform_sampling_index(sample_size)
-#
-#     def __add__(self, other):
-#         if other is None:
-#             return self
-#         elif issparse(self.instances) and issparse(other.instances):
-#             join_instances = vstack([self.instances, other.instances])
-#         elif isinstance(self.instances, list) and isinstance(other.instances, list):
-#             join_instances = self.instances + other.instances
-#         elif isinstance(self.instances, np.ndarray) and isinstance(other.instances, np.ndarray):
-#             join_instances = np.concatenate([self.instances, other.instances])
-#         else:
-#             raise NotImplementedError('unsupported operation for collection types')
-#         labels = np.concatenate([self.labels, other.labels])
-#         return LabelledCollection(join_instances, labels)
-#
-#     @property
-#     def Xy(self):
-#         return self.instances, self.labels
-#
-#     def stats(self, show=True):
-#         ninstances = len(self)
-#         instance_type = type(self.instances[0])
-#         if instance_type == list:
-#             nfeats = len(self.instances[0])
-#         elif instance_type == np.ndarray or issparse(self.instances):
-#             nfeats = self.instances.shape[1]
-#         else:
-#             nfeats = '?'
-#         stats_ = {'instances': ninstances,
-#                   'type': instance_type,
-#                   'features': nfeats,
-#                   'classes': self.classes_,
-#                   'prevs': strprev(self.prevalence())}
-#         if show:
-#             print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '
-#                   f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
-#         return stats_
-#
-#     def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
-#         kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
-#         for train_index, test_index in kf.split(*self.Xy):
-#             train = self.sampling_from_index(train_index)
-#             test = self.sampling_from_index(test_index)
-#             yield train, test
 
 class LabelledCollection:
     '''
@@ -146,8 +41,8 @@ class LabelledCollection:
         self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
 
     @classmethod
-    def load(cls, path: str, loader_func: callable, classes=None):
-        return LabelledCollection(*loader_func(path), classes)
+    def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):
+        return LabelledCollection(*loader_func(path, **loader_kwargs), classes)
 
     def __len__(self):
         return self.instances.shape[0]