cleaning stuff from LeQua2022 branch

2021-12-15 16:57:13 +01:00 · 2021-12-15 16:57:13 +01:00 · cfdf2e35bd
parent e64a6e989a
commit cfdf2e35bd
8 changed files with 0 additions and 592 deletions
--- a/LeQua2022/TODO.txt
+++ b/LeQua2022/TODO.txt
@@ -1,8 +0,0 @@
-2. tablas?
-3. fetch dataset (download, unzip, etc.)
-5. plots
-11. Quitar las categorias como columnas de los ficheros de prevalences
-12. sample_size cannot be set to a non-integer in GridSearchQ whith protocol="gen" (it could, but is not indicated in doc)
-13. repair doc of GridSearchQ
-14. reparar la calibracion en LR (lo tuve que quitar para que funcionara GridSearchQ, y lo quité en todos los ficheros)
-15. podria poner que el eval_budget se usase en GridSearchQ con generator function para el progress bar de tqdm
--- a/LeQua2022/_depr_baselines_T2.py
+++ b/LeQua2022/_depr_baselines_T2.py
@@ -1,117 +0,0 @@
-import argparse
-import pickle
-
-from sklearn.decomposition import TruncatedSVD
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression as LR
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import StandardScaler
-
-from LeQua2022.pretrained_embeddings import TfidfWordEmbeddingTransformer, WordEmbeddingAverageTransformer
-from LeQua2022.word_class_embeddings import WordClassEmbeddingsTransformer, ConcatenateEmbeddingsTransformer
-from quapy.method.aggregative import *
-from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
-import quapy.functional as F
-from data import *
-import os
-import constants
-
-
-# LeQua official baselines for task T1A (Binary/Vector) and T1B (Multiclass/Vector)
-# =========================================================
-
-def baselines():
-    yield CC(LR(n_jobs=-1)), "CC"
-    # yield ACC(LR(n_jobs=-1)), "ACC"
-    # yield PCC(LR(n_jobs=-1)), "PCC"
-    # yield PACC(LR(n_jobs=-1)), "PACC"
-    # yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
-    # yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
-    # yield MLPE(), "MLPE"
-
-
-def main(args):
-
-    models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task))
-
-    path_dev_vectors = os.path.join(args.datadir, 'dev_samples')
-    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.txt')
-    path_train = os.path.join(args.datadir, 'training_data.txt')
-
-    qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
-
-    train = LabelledCollection.load(path_train, load_raw_documents)
-
-    tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))
-    train.instances = tfidf.fit_transform(*train.Xy)
-
-    print(f'number of classes: {len(train.classes_)}')
-    print(f'number of training documents: {len(train)}')
-    print(f'training prevalence: {F.strprev(train.prevalence())}')
-    print(f'training matrix shape: {train.instances.shape}')
-
-    # param_grid = {
-    #     'C': np.logspace(-3, 3, 7),
-    #     'class_weight': ['balanced', None]
-    # }
-
-    param_grid = {
-        'C': [1],
-        'class_weight': ['balanced']
-    }
-
-    def gen_samples():
-        return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
-                                load_fn=load_raw_unlabelled_documents, vectorizer=tfidf)
-
-    outs = []
-    for quantifier, q_name in baselines():
-        print(f'{q_name}: Model selection')
-        quantifier = qp.model_selection.GridSearchQ(
-            quantifier,
-            param_grid,
-            sample_size=None,
-            protocol='gen',
-            error=qp.error.mrae,
-            refit=False,
-            verbose=True
-        ).fit(train, gen_samples)
-
-        print(f'{q_name} got MAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
-        outs.append(f'{q_name} got MAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
-
-        model_path = os.path.join(models_path, q_name+'.'+args.task+'.pkl')
-        print(f'saving model in {model_path}')
-        pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
-
-    print(outs)
-    with open(f'{args.mode}.{args.task}.txt', 'wt') as foo:
-        for line in outs:
-            foo.write(f'{line}\n')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 Task T2A/T2B baselines')
-    parser.add_argument('task', metavar='TASK', type=str, choices=['T2A', 'T2B'],
-                        help='Task name (T2A, T2B)')
-    parser.add_argument('datadir', metavar='DATA-PATH', type=str,
-                        help='Path of the directory containing "dev_prevalences.txt", "training_data.txt", and '
-                             'the directory "dev_documents"')
-    parser.add_argument('modeldir', metavar='MODEL-PATH', type=str,
-                        help='Path where to save the models. '
-                             'A subdirectory named <task> will be automatically created.')
-    args = parser.parse_args()
-
-    if not os.path.exists(args.datadir):
-        raise FileNotFoundError(f'path {args.datadir} does not exist')
-    if not os.path.isdir(args.datadir):
-        raise ValueError(f'path {args.datadir} is not a valid directory')
-    if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.txt")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.txt" file')
-    if not os.path.exists(os.path.join(args.datadir, "training_data.txt")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "training_data.txt" file')
-    if not os.path.exists(os.path.join(args.datadir, "dev_samples")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_samples" folder')
-
-    main(args)
-
--- a/LeQua2022/baselines.py
+++ b/LeQua2022/baselines.py
@@ -1,108 +0,0 @@
-import argparse
-import pickle
-
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression as LR
-from quapy.method.aggregative import *
-from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
-import quapy.functional as F
-from data import *
-import os
-import constants
-
-
-# LeQua official baselines for task T1A (Binary/Vector) and T1B (Multiclass/Vector)
-# =========================================================
-
-def baselines():
-    yield CC(LR(n_jobs=-1)), "CC"
-    # yield ACC(LR(n_jobs=-1)), "ACC"
-    # yield PCC(LR(n_jobs=-1)), "PCC"
-    yield PACC(LR(n_jobs=-1)), "PACC"
-    yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
-    # yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
-    # yield MLPE(), "MLPE"
-
-
-def main(args):
-
-    models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task))
-
-    path_dev_vectors = os.path.join(args.datadir, 'dev_samples')
-    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.txt')
-    path_train = os.path.join(args.datadir, 'training_data.txt')
-
-    qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
-
-    if args.task in {'T1A', 'T1B'}:
-        train = LabelledCollection.load(path_train, load_vector_documents)
-
-        def gen_samples():
-            return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, load_fn=load_vector_documents)
-    else:
-        train = LabelledCollection.load(path_train, load_raw_documents)
-        tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1, 2))
-        train.instances = tfidf.fit_transform(*train.Xy)
-
-        def gen_samples():
-            return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
-                                    load_fn=load_raw_documents, vectorizer=tfidf)
-
-    print(f'number of classes: {len(train.classes_)}')
-    print(f'number of training documents: {len(train)}')
-    print(f'training prevalence: {F.strprev(train.prevalence())}')
-    print(f'training matrix shape: {train.instances.shape}')
-
-    param_grid = {
-        'C': np.logspace(-3, 3, 7),
-        'class_weight': ['balanced', None]
-    }
-
-    param_grid = {
-        'C': [0.01],
-        'class_weight': ['balanced']
-    }
-
-    for quantifier, q_name in baselines():
-        print(f'{q_name}: Model selection')
-        quantifier = qp.model_selection.GridSearchQ(
-            quantifier,
-            param_grid,
-            sample_size=None,
-            protocol='gen',
-            error=qp.error.mrae,
-            refit=False,
-            verbose=True
-        ).fit(train, gen_samples)
-
-        print(f'{q_name} got MRAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
-
-        model_path = os.path.join(models_path, q_name+'.pkl')
-        print(f'saving model in {model_path}')
-        pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 baselines')
-    parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'],
-                        help='Task name (T1A, T1B, T2A, T2B)')
-    parser.add_argument('datadir', metavar='DATA-PATH', type=str,
-                        help='Path of the directory containing "dev_prevalences.txt", "training_data.txt", and '
-                             'the directory "dev_samples"')
-    parser.add_argument('modeldir', metavar='MODEL-PATH', type=str,
-                        help='Path where to save the models. '
-                             'A subdirectory named <task> will be automatically created.')
-    args = parser.parse_args()
-
-    if not os.path.exists(args.datadir):
-        raise FileNotFoundError(f'path {args.datadir} does not exist')
-    if not os.path.isdir(args.datadir):
-        raise ValueError(f'path {args.datadir} is not a valid directory')
-    if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.txt")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.txt" file')
-    if not os.path.exists(os.path.join(args.datadir, "training_data.txt")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "training_data.txt" file')
-    if not os.path.exists(os.path.join(args.datadir, "dev_samples")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_samples" folder')
-
-    main(args)
--- a/LeQua2022/constants.py
+++ b/LeQua2022/constants.py
@@ -1,16 +0,0 @@
-DEV_SAMPLES = 1000
-TEST_SAMPLES = 5000
-
-TXA_SAMPLE_SIZE = 250
-TXB_SAMPLE_SIZE = 1000
-
-SAMPLE_SIZE={
-    'TXA': TXA_SAMPLE_SIZE,
-    'TXB': TXB_SAMPLE_SIZE,
-    'T1A': TXA_SAMPLE_SIZE,
-    'T1B': TXB_SAMPLE_SIZE,
-    'T2A': TXA_SAMPLE_SIZE,
-    'T2B': TXB_SAMPLE_SIZE
-}
-
-ERROR_TOL = 1E-3
--- a/LeQua2022/data.py
+++ b/LeQua2022/data.py
@@ -1,222 +0,0 @@
-import os.path
-from typing import List, Tuple, Union
-
-import pandas as pd
-
-import quapy as qp
-import numpy as np
-import sklearn
-import re
-from glob import glob
-
-import constants
-
-
-def load_category_map(path):
-    cat2code = {}
-    with open(path, 'rt') as fin:
-        for line in fin:
-            category, code = line.split()
-            cat2code[category] = int(code)
-    code2cat = [cat for cat, code in sorted(cat2code.items(), key=lambda x:x[1])]
-    return cat2code, code2cat
-
-
-def load_raw_documents(path, vectorizer=None):
-    df = pd.read_csv(path)
-    documents = list(df["text"].values)
-    if vectorizer:
-        documents = vectorizer.transform(documents)
-    labels = None
-    if "label" in df.columns:
-        labels = df["label"].values.astype(np.int)
-    return documents, labels
-
-
-def load_vector_documents(path):
-    D = pd.read_csv(path).to_numpy(dtype=np.float)
-    labelled = D.shape[1] == 301
-    if labelled:
-        X, y = D[:,1:], D[:,0].astype(np.int).flatten()
-    else:
-        X, y = D, None
-    return X, y
-
-
-def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, load_fn, **load_kwargs):
-    true_prevs = ResultSubmission.load(ground_truth_path)
-    for id, prevalence in true_prevs.iterrows():
-        sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)
-        yield (id, sample, prevalence) if return_id else (sample, prevalence)
-
-
-def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn, **load_kwargs):
-    nsamples = len(glob(os.path.join(path_dir, f'*.txt')))
-    for id in range(nsamples):
-        sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)
-        yield (id, sample) if return_id else sample
-
-
-def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=False, load_fn=load_vector_documents, **load_kwargs):
-    if ground_truth_path is None:
-        # the generator function returns tuples (docid:str, sample:csr_matrix or str)
-        gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, **load_kwargs)
-    else:
-        # the generator function returns tuples (docid:str, sample:csr_matrix or str, prevalence:ndarray)
-        gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, **load_kwargs)
-    for r in gen_fn:
-        yield r
-
-
-class ResultSubmission:
-
-    def __init__(self):
-        self.df = None
-
-    def __init_df(self, categories:int):
-        if not isinstance(categories, int) or categories < 2:
-            raise TypeError('wrong format for categories: an int (>=2) was expected')
-        df = pd.DataFrame(columns=list(range(categories)))
-        df.index.set_names('id', inplace=True)
-        self.df = df
-
-    @property
-    def n_categories(self):
-        return len(self.df.columns.values)
-
-    def add(self, sample_id:int, prevalence_values:np.ndarray):
-        if not isinstance(sample_id, int):
-            raise TypeError(f'error: expected int for sample_sample, found {type(sample_id)}')
-        if not isinstance(prevalence_values, np.ndarray):
-            raise TypeError(f'error: expected np.ndarray for prevalence_values, found {type(prevalence_values)}')
-        if self.df is None:
-            self.__init_df(categories=len(prevalence_values))
-        if sample_id in self.df.index.values:
-            raise ValueError(f'error: prevalence values for "{sample_id}" already added')
-        if prevalence_values.ndim!=1 and prevalence_values.size != self.n_categories:
-            raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}')
-        if (prevalence_values<0).any() or (prevalence_values>1).any():
-            raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_id}"')
-        if np.abs(prevalence_values.sum()-1) > constants.ERROR_TOL:
-            raise ValueError(f'error: prevalence values do not sum up to one for "{sample_id}"'
-                             f'(error tolerance {constants.ERROR_TOL})')
-
-        self.df.loc[sample_id] = prevalence_values
-
-    def __len__(self):
-        return len(self.df)
-
-    @classmethod
-    def load(cls, path: str) -> 'ResultSubmission':
-        df = ResultSubmission.check_file_format(path)
-        r = ResultSubmission()
-        r.df = df
-        return r
-
-    def dump(self, path:str):
-        ResultSubmission.check_dataframe_format(self.df)
-        self.df.to_csv(path)
-
-    def prevalence(self, sample_id:int):
-        sel = self.df.loc[sample_id]
-        if sel.empty:
-            return None
-        else:
-            return sel.values.flatten()
-
-    def iterrows(self):
-        for index, row in self.df.iterrows():
-            prevalence = row.values.flatten()
-            yield index, prevalence
-
-    @classmethod
-    def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
-        try:
-            df = pd.read_csv(path, index_col=0)
-        except Exception as e:
-            print(f'the file {path} does not seem to be a valid csv file. ')
-            print(e)
-        return ResultSubmission.check_dataframe_format(df, path=path)
-
-    @classmethod
-    def check_dataframe_format(cls, df, path=None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
-        hint_path = ''  # if given, show the data path in the error message
-        if path is not None:
-            hint_path = f' in {path}'
-
-        if df.index.name != 'id' or len(df.columns) < 2:
-            raise ValueError(f'wrong header{hint_path}, '
-                             f'the format of the header should be "id,0,...,n-1", '
-                             f'where n is the number of categories')
-        if [int(ci) for ci in df.columns.values] != list(range(len(df.columns))):
-            raise ValueError(f'wrong header{hint_path}, category ids should be 0,1,2,...,n-1, '
-                             f'where n is the number of categories')
-        if df.empty:
-            raise ValueError(f'error{hint_path}: results file is empty')
-        elif len(df) != constants.DEV_SAMPLES and len(df) != constants.TEST_SAMPLES:
-            raise ValueError(f'wrong number of prevalence values found{hint_path}; '
-                             f'expected {constants.DEV_SAMPLES} for development sets and '
-                             f'{constants.TEST_SAMPLES} for test sets; found {len(df)}')
-
-        ids = set(df.index.values)
-        expected_ids = set(range(len(df)))
-        if ids != expected_ids:
-            missing = expected_ids - ids
-            if missing:
-                raise ValueError(f'there are {len(missing)} missing ids{hint_path}: {sorted(missing)}')
-            unexpected = ids - expected_ids
-            if unexpected:
-                raise ValueError(f'there are {len(missing)} unexpected ids{hint_path}: {sorted(unexpected)}')
-
-        for category_id in df.columns:
-            if (df[category_id] < 0).any() or (df[category_id] > 1).any():
-                raise ValueError(f'error{hint_path} column "{category_id}" contains values out of range [0,1]')
-
-        prevs = df.values
-        round_errors = np.abs(prevs.sum(axis=-1) - 1.) > constants.ERROR_TOL
-        if round_errors.any():
-            raise ValueError(f'warning: prevalence values in rows with id {np.where(round_errors)[0].tolist()} '
-                              f'do not sum up to 1 (error tolerance {constants.ERROR_TOL}), '
-                              f'probably due to some rounding errors.')
-
-        return df
-
-
-def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=None, average=True):
-    if sample_size is None:
-        if qp.environ['SAMPLE_SIZE'] is None:
-            raise ValueError('Relative Absolute Error cannot be computed: '
-                             'neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')
-        else:
-            sample_size = qp.environ['SAMPLE_SIZE']
-
-    if len(true_prevs) != len(predicted_prevs):
-        raise ValueError(f'size mismatch, ground truth file has {len(true_prevs)} entries '
-                         f'while the file of predictions contain {len(predicted_prevs)} entries')
-    if true_prevs.n_categories != predicted_prevs.n_categories:
-        raise ValueError(f'these result files are not comparable since the categories are different: '
-                         f'true={true_prevs.n_categories} categories vs. '
-                         f'predictions={predicted_prevs.n_categories} categories')
-    rae, ae = [], []
-    for sample_id, true_prevalence in true_prevs.iterrows():
-        pred_prevalence = predicted_prevs.prevalence(sample_id)
-        rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size)))
-        ae.append(qp.error.ae(true_prevalence, pred_prevalence))
-
-    rae = np.asarray(rae)
-    ae = np.asarray(ae)
-
-    if average:
-        return rae.mean(), ae.mean()
-    else:
-        return rae, ae
-
-
-
-
-
-
-
-
-
-
--- a/LeQua2022/evaluate.py
+++ b/LeQua2022/evaluate.py
@@ -1,42 +0,0 @@
-import argparse
-import quapy as qp
-from data import ResultSubmission, evaluate_submission
-import constants
-
-"""
-LeQua2022 Official evaluation script 
-"""
-
-def main(args):
-
-    sample_size = constants.SAMPLE_SIZE[args.task]
-
-    true_prev = ResultSubmission.load(args.true_prevalences)
-    pred_prev = ResultSubmission.load(args.pred_prevalences)
-
-    mrae, mae = evaluate_submission(true_prev, pred_prev, sample_size)
-    print(f'MRAE: {mrae:.4f}')
-    print(f'MAE: {mae:.4f}')
-
-    if args.output is not None:
-        with open(args.output, 'wt') as foo:
-            foo.write(f'MRAE: {mrae:.4f}\n')
-            foo.write(f'MAE: {mae:.4f}\n')
-
-
-if __name__=='__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 official evaluation script')
-    parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'],
-                        help='Task name (T1A, T1B, T2A, T2B)')
-    parser.add_argument('true_prevalences', metavar='TRUE-PREV-PATH', type=str,
-                        help='Path of ground truth prevalence values file (.csv)')
-    parser.add_argument('pred_prevalences', metavar='PRED-PREV-PATH', type=str,
-                        help='Path of predicted prevalence values file (.csv)')
-    parser.add_argument('--output', metavar='SCORES-PATH', type=str, default=None,
-                        help='Path where to store the evaluation scores')
-    args = parser.parse_args()
-
-    if args.output is not None:
-        qp.util.create_parent_dir(args.output)
-
-    main(args)
--- a/LeQua2022/format_checker.py
+++ b/LeQua2022/format_checker.py
@@ -1,25 +0,0 @@
-import argparse
-from data import ResultSubmission
-
-
-"""
-LeQua2022 Official format-checker script 
-"""
-
-def main(args):
-    try:
-        ResultSubmission.check_file_format(args.prevalence_file)
-    except Exception as e:
-        print(e)
-        print('Format check: [not passed]')
-    else:
-        print('Format check: [passed]')
-
-
-if __name__=='__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 official format-checker script')
-    parser.add_argument('prevalence_file', metavar='PREVALENCEFILE-PATH', type=str,
-                        help='Path of the file containing prevalence values to check')
-    args = parser.parse_args()
-
-    main(args)
--- a/LeQua2022/predict.py
+++ b/LeQua2022/predict.py
@@ -1,54 +0,0 @@
-import argparse
-import quapy as qp
-from data import ResultSubmission
-import os
-import pickle
-from tqdm import tqdm
-from data import gen_load_samples
-from glob import glob
-import constants
-
-"""
-LeQua2022 prediction script 
-"""
-
-def main(args):
-
-    # check the number of samples
-    nsamples = len(glob(os.path.join(args.samples, '*.txt')))
-    if nsamples not in {constants.DEV_SAMPLES, constants.TEST_SAMPLES}:
-        print(f'Warning: The number of samples does neither coincide with the expected number of '
-              f'dev samples ({constants.DEV_SAMPLES}) nor with the expected number of '
-              f'test samples ({constants.TEST_SAMPLES}).')
-
-    # load pickled model
-    model = pickle.load(open(args.model, 'rb'))
-
-    # predictions
-    predictions = ResultSubmission()
-    for sampleid, sample in tqdm(gen_load_samples(args.samples, return_id=True, load_fn=), desc='predicting', total=nsamples):
-        predictions.add(sampleid, model.quantify(sample))
-
-    # saving
-    qp.util.create_parent_dir(args.output)
-    predictions.dump(args.output)
-
-
-if __name__=='__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 prediction script')
-    parser.add_argument('model', metavar='MODEL-PATH', type=str,
-                        help='Path of saved model')
-    parser.add_argument('samples', metavar='SAMPLES-PATH', type=str,
-                        help='Path to the directory containing the samples')
-    parser.add_argument('output', metavar='PREDICTIONS-PATH', type=str,
-                        help='Path where to store the predictions file')
-    parser.add_argument('nf', metavar='NUM-FEATURES', type=int,
-                        help='Number of features seen during training')
-    args = parser.parse_args()
-
-    if not os.path.exists(args.samples):
-        raise FileNotFoundError(f'path {args.samples} does not exist')
-    if not os.path.isdir(args.samples):
-        raise ValueError(f'path {args.samples} is not a valid directory')
-
-    main(args)