From cfdf2e35bd96159aa184628b634e059de9eeaf5d Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Wed, 15 Dec 2021 16:57:13 +0100
Subject: [PATCH] cleaning stuff from LeQua2022 branch

---
 LeQua2022/TODO.txt              |   8 --
 LeQua2022/_depr_baselines_T2.py | 117 -----------------
 LeQua2022/baselines.py          | 108 ----------------
 LeQua2022/constants.py          |  16 ---
 LeQua2022/data.py               | 222 --------------------------------
 LeQua2022/evaluate.py           |  42 ------
 LeQua2022/format_checker.py     |  25 ----
 LeQua2022/predict.py            |  54 --------
 8 files changed, 592 deletions(-)
 delete mode 100644 LeQua2022/TODO.txt
 delete mode 100644 LeQua2022/_depr_baselines_T2.py
 delete mode 100644 LeQua2022/baselines.py
 delete mode 100644 LeQua2022/constants.py
 delete mode 100644 LeQua2022/data.py
 delete mode 100644 LeQua2022/evaluate.py
 delete mode 100644 LeQua2022/format_checker.py
 delete mode 100644 LeQua2022/predict.py

diff --git a/LeQua2022/TODO.txt b/LeQua2022/TODO.txt
deleted file mode 100644
index e0222df..0000000
--- a/LeQua2022/TODO.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-2. tables?
-3. fetch dataset (download, unzip, etc.)
-5. plots
-11. remove the categories as columns from the prevalence files
-12. sample_size cannot be set to a non-integer in GridSearchQ with protocol="gen" (it could, but this is not indicated in the doc)
-13. repair the documentation of GridSearchQ
-14. repair the calibration in LR (I had to remove it so that GridSearchQ would work, and I removed it in all files)
-15. eval_budget could be used in GridSearchQ with a generator function, for tqdm's progress bar
\ No newline at end of file
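
TODO items 12 and 15 concern GridSearchQ's 'gen' protocol; a minimal sketch of the intended usage, mirroring the calls in baselines.py below (train is a LabelledCollection as in those scripts, and gen_validation_samples is a placeholder generator):

    import quapy as qp
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import CC

    def gen_validation_samples():
        ...  # placeholder: should yield (sample, prevalence) pairs, as gen_load_samples does in data.py

    quantifier = qp.model_selection.GridSearchQ(
        CC(LogisticRegression()),
        param_grid={'C': [0.1, 1, 10]},
        sample_size=None,     # item 12: only integers are accepted unless protocol='gen'
        protocol='gen',
        error=qp.error.mrae,
        refit=False,
    ).fit(train, gen_validation_samples)
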
diff --git a/LeQua2022/_depr_baselines_T2.py b/LeQua2022/_depr_baselines_T2.py
deleted file mode 100644
index 2b02e7f..0000000
--- a/LeQua2022/_depr_baselines_T2.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import argparse
-import pickle
-
-from sklearn.decomposition import TruncatedSVD
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression as LR
-from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import StandardScaler
-
-from LeQua2022.pretrained_embeddings import TfidfWordEmbeddingTransformer, WordEmbeddingAverageTransformer
-from LeQua2022.word_class_embeddings import WordClassEmbeddingsTransformer, ConcatenateEmbeddingsTransformer
-from quapy.method.aggregative import *
-from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
-import quapy.functional as F
-from data import *
-import os
-import constants
-
-
-# LeQua official baselines for tasks T2A (Binary/Raw documents) and T2B (Multiclass/Raw documents)
-# =========================================================
-
-def baselines():
-    yield CC(LR(n_jobs=-1)), "CC"
-    # yield ACC(LR(n_jobs=-1)), "ACC"
-    # yield PCC(LR(n_jobs=-1)), "PCC"
-    # yield PACC(LR(n_jobs=-1)), "PACC"
-    # yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
-    # yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
-    # yield MLPE(), "MLPE"
-
-
-def main(args):
-
-    models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task))
-
-    path_dev_vectors = os.path.join(args.datadir, 'dev_samples')
-    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.txt')
-    path_train = os.path.join(args.datadir, 'training_data.txt')
-
-    qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
-
-    train = LabelledCollection.load(path_train, load_raw_documents)
-
-    tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1, 2))
-    train.instances = tfidf.fit_transform(*train.Xy)
-
-    print(f'number of classes: {len(train.classes_)}')
-    print(f'number of training documents: {len(train)}')
-    print(f'training prevalence: {F.strprev(train.prevalence())}')
-    print(f'training matrix shape: {train.instances.shape}')
-
-    # param_grid = {
-    #     'C': np.logspace(-3, 3, 7),
-    #     'class_weight': ['balanced', None]
-    # }
-
-    param_grid = {
-        'C': [1],
-        'class_weight': ['balanced']
-    }
-
-    def gen_samples():
-        return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
-                                load_fn=load_raw_documents, vectorizer=tfidf)
-
-    outs = []
-    for quantifier, q_name in baselines():
-        print(f'{q_name}: Model selection')
-        quantifier = qp.model_selection.GridSearchQ(
-            quantifier,
-            param_grid,
-            sample_size=None,
-            protocol='gen',
-            error=qp.error.mrae,
-            refit=False,
-            verbose=True
-        ).fit(train, gen_samples)
-
-        print(f'{q_name} got MRAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
-        outs.append(f'{q_name} got MRAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
-
-        model_path = os.path.join(models_path, q_name+'.'+args.task+'.pkl')
-        print(f'saving model in {model_path}')
-        pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
-
-    print(outs)
-    with open(f'results.{args.task}.txt', 'wt') as foo:  # note: the parser defines no 'mode' argument
-        for line in outs:
-            foo.write(f'{line}\n')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 Task T2A/T2B baselines')
-    parser.add_argument('task', metavar='TASK', type=str, choices=['T2A', 'T2B'],
-                        help='Task name (T2A, T2B)')
-    parser.add_argument('datadir', metavar='DATA-PATH', type=str,
-                        help='Path of the directory containing "dev_prevalences.txt", "training_data.txt", and '
-                             'the directory "dev_samples"')
-    parser.add_argument('modeldir', metavar='MODEL-PATH', type=str,
-                        help='Path where to save the models. '
-                             'A subdirectory named after the task will be automatically created.')
-    args = parser.parse_args()
-
-    if not os.path.exists(args.datadir):
-        raise FileNotFoundError(f'path {args.datadir} does not exist')
-    if not os.path.isdir(args.datadir):
-        raise ValueError(f'path {args.datadir} is not a valid directory')
-    if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.txt")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.txt" file')
-    if not os.path.exists(os.path.join(args.datadir, "training_data.txt")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "training_data.txt" file')
-    if not os.path.exists(os.path.join(args.datadir, "dev_samples")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_samples" folder')
-
-    main(args)
-
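
The tfidf.fit_transform(*train.Xy) idiom above works because train.Xy unpacks to (instances, labels) and sklearn's TfidfVectorizer.fit_transform accepts and ignores a y argument; an equivalent, more explicit form:

    # same effect as tfidf.fit_transform(*train.Xy)
    X, y = train.Xy
    train.instances = tfidf.fit_transform(X, y)  # y is ignored by the vectorizer
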
diff --git a/LeQua2022/baselines.py b/LeQua2022/baselines.py
deleted file mode 100644
index 28fd2d7..0000000
--- a/LeQua2022/baselines.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import argparse
-import pickle
-
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression as LR
-from quapy.method.aggregative import *
-from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
-import quapy.functional as F
-from data import *
-import os
-import constants
-
-
-# LeQua official baselines for tasks T1A/T1B (Vector) and T2A/T2B (Raw documents)
-# =========================================================
-
-def baselines():
-    yield CC(LR(n_jobs=-1)), "CC"
-    # yield ACC(LR(n_jobs=-1)), "ACC"
-    # yield PCC(LR(n_jobs=-1)), "PCC"
-    yield PACC(LR(n_jobs=-1)), "PACC"
-    yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
-    # yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
-    # yield MLPE(), "MLPE"
-
-
-def main(args):
-
-    models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task))
-
-    path_dev_vectors = os.path.join(args.datadir, 'dev_samples')
-    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.txt')
-    path_train = os.path.join(args.datadir, 'training_data.txt')
-
-    qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
-
-    if args.task in {'T1A', 'T1B'}:
-        train = LabelledCollection.load(path_train, load_vector_documents)
-
-        def gen_samples():
-            return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, load_fn=load_vector_documents)
-    else:
-        train = LabelledCollection.load(path_train, load_raw_documents)
-        tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1, 2))
-        train.instances = tfidf.fit_transform(*train.Xy)
-
-        def gen_samples():
-            return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
-                                    load_fn=load_raw_documents, vectorizer=tfidf)
-
-    print(f'number of classes: {len(train.classes_)}')
-    print(f'number of training documents: {len(train)}')
-    print(f'training prevalence: {F.strprev(train.prevalence())}')
-    print(f'training matrix shape: {train.instances.shape}')
-
-    # param_grid = {
-    #     'C': np.logspace(-3, 3, 7),
-    #     'class_weight': ['balanced', None]
-    # }
-
-    param_grid = {  # reduced grid; silently overrode the full grid above, now commented out
-        'C': [0.01],
-        'class_weight': ['balanced']
-    }
-
-    for quantifier, q_name in baselines():
-        print(f'{q_name}: Model selection')
-        quantifier = qp.model_selection.GridSearchQ(
-            quantifier,
-            param_grid,
-            sample_size=None,
-            protocol='gen',
-            error=qp.error.mrae,
-            refit=False,
-            verbose=True
-        ).fit(train, gen_samples)
-
-        print(f'{q_name} got MRAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
-
-        model_path = os.path.join(models_path, q_name+'.pkl')
-        print(f'saving model in {model_path}')
-        pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 baselines')
-    parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'],
-                        help='Task name (T1A, T1B, T2A, T2B)')
-    parser.add_argument('datadir', metavar='DATA-PATH', type=str,
-                        help='Path of the directory containing "dev_prevalences.txt", "training_data.txt", and '
-                             'the directory "dev_samples"')
-    parser.add_argument('modeldir', metavar='MODEL-PATH', type=str,
-                        help='Path where to save the models. '
-                             'A subdirectory named after the task will be automatically created.')
-    args = parser.parse_args()
-
-    if not os.path.exists(args.datadir):
-        raise FileNotFoundError(f'path {args.datadir} does not exist')
-    if not os.path.isdir(args.datadir):
-        raise ValueError(f'path {args.datadir} is not a valid directory')
-    if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.txt")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.txt" file')
-    if not os.path.exists(os.path.join(args.datadir, "training_data.txt")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "training_data.txt" file')
-    if not os.path.exists(os.path.join(args.datadir, "dev_samples")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_samples" folder')
-
-    main(args)
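
As a usage note, the script above would be invoked as follows (directory names are placeholders), with T1A/T1B expecting pre-vectorized samples and T2A/T2B raw documents:

    python baselines.py T1A /path/to/T1A/data /path/to/models
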
diff --git a/LeQua2022/constants.py b/LeQua2022/constants.py
deleted file mode 100644
index 2c4bc77..0000000
--- a/LeQua2022/constants.py
+++ /dev/null
@@ -1,16 +0,0 @@
-DEV_SAMPLES = 1000
-TEST_SAMPLES = 5000
-
-TXA_SAMPLE_SIZE = 250
-TXB_SAMPLE_SIZE = 1000
-
-SAMPLE_SIZE = {
-    'TXA': TXA_SAMPLE_SIZE,
-    'TXB': TXB_SAMPLE_SIZE,
-    'T1A': TXA_SAMPLE_SIZE,
-    'T1B': TXB_SAMPLE_SIZE,
-    'T2A': TXA_SAMPLE_SIZE,
-    'T2B': TXB_SAMPLE_SIZE
-}
-
-ERROR_TOL = 1E-3
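
These constants are consumed in two places by the other scripts in this patch; a brief sketch (values taken from the table above, task name illustrative):

    import quapy as qp
    import constants

    qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE['T1A']  # 250 for the *A (binary) tasks
    eps = 1. / (2 * constants.SAMPLE_SIZE['T1A'])             # smoothing used by qp.error.rae in data.py
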
diff --git a/LeQua2022/data.py b/LeQua2022/data.py
deleted file mode 100644
index cf3587f..0000000
--- a/LeQua2022/data.py
+++ /dev/null
@@ -1,222 +0,0 @@
-import os.path
-from typing import List, Tuple, Union
-
-import pandas as pd
-
-import quapy as qp
-import numpy as np
-import sklearn
-import re
-from glob import glob
-
-import constants
-
-
-def load_category_map(path):
-    cat2code = {}
-    with open(path, 'rt') as fin:
-        for line in fin:
-            category, code = line.split()
-            cat2code[category] = int(code)
-    code2cat = [cat for cat, code in sorted(cat2code.items(), key=lambda x: x[1])]
-    return cat2code, code2cat
-
-
-def load_raw_documents(path, vectorizer=None):
-    df = pd.read_csv(path)
-    documents = list(df["text"].values)
-    if vectorizer:
-        documents = vectorizer.transform(documents)
-    labels = None
-    if "label" in df.columns:
-        labels = df["label"].values.astype(int)
-    return documents, labels
-
-
-def load_vector_documents(path):
-    D = pd.read_csv(path).to_numpy(dtype=float)
-    labelled = D.shape[1] == 301  # 300 feature columns plus one leading label column
-    if labelled:
-        X, y = D[:, 1:], D[:, 0].astype(int).flatten()
-    else:
-        X, y = D, None
-    return X, y
-
-
-def __gen_load_samples_with_groundtruth(path_dir: str, return_id: bool, ground_truth_path: str, load_fn, **load_kwargs):
-    true_prevs = ResultSubmission.load(ground_truth_path)
-    for id, prevalence in true_prevs.iterrows():
-        sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)
-        yield (id, sample, prevalence) if return_id else (sample, prevalence)
-
-
-def __gen_load_samples_without_groundtruth(path_dir: str, return_id: bool, load_fn, **load_kwargs):
-    nsamples = len(glob(os.path.join(path_dir, '*.txt')))
-    for id in range(nsamples):
-        sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)
-        yield (id, sample) if return_id else sample
-
-
-def gen_load_samples(path_dir: str, ground_truth_path: str = None, return_id=False, load_fn=load_vector_documents, **load_kwargs):
-    if ground_truth_path is None:
-        # yields samples (or (id, sample) pairs if return_id is True)
-        gen_fn = __gen_load_samples_without_groundtruth(path_dir, return_id, load_fn, **load_kwargs)
-    else:
-        # yields (sample, prevalence) tuples (or (id, sample, prevalence) if return_id is True)
-        gen_fn = __gen_load_samples_with_groundtruth(path_dir, return_id, ground_truth_path, load_fn, **load_kwargs)
-    for r in gen_fn:
-        yield r
-
-
-class ResultSubmission:
-
-    def __init__(self):
-        self.df = None
-
-    def __init_df(self, categories: int):
-        if not isinstance(categories, int) or categories < 2:
-            raise TypeError('wrong format for categories: an int (>=2) was expected')
-        df = pd.DataFrame(columns=list(range(categories)))
-        df.index.set_names('id', inplace=True)
-        self.df = df
-
-    @property
-    def n_categories(self):
-        return len(self.df.columns.values)
-
-    def add(self, sample_id: int, prevalence_values: np.ndarray):
-        if not isinstance(sample_id, int):
-            raise TypeError(f'error: expected int for sample_id, found {type(sample_id)}')
-        if not isinstance(prevalence_values, np.ndarray):
-            raise TypeError(f'error: expected np.ndarray for prevalence_values, found {type(prevalence_values)}')
-        if self.df is None:
-            self.__init_df(categories=len(prevalence_values))
-        if sample_id in self.df.index.values:
-            raise ValueError(f'error: prevalence values for "{sample_id}" already added')
-        if prevalence_values.ndim != 1 or prevalence_values.size != self.n_categories:
-            raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}')
-        if (prevalence_values < 0).any() or (prevalence_values > 1).any():
-            raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_id}"')
-        if np.abs(prevalence_values.sum()-1) > constants.ERROR_TOL:
-            raise ValueError(f'error: prevalence values do not sum up to one for "{sample_id}" '
                             f'(error tolerance {constants.ERROR_TOL})')
-
-        self.df.loc[sample_id] = prevalence_values
-
-    def __len__(self):
-        return len(self.df)
-
-    @classmethod
-    def load(cls, path: str) -> 'ResultSubmission':
-        df = ResultSubmission.check_file_format(path)
-        r = ResultSubmission()
-        r.df = df
-        return r
-
-    def dump(self, path: str):
-        ResultSubmission.check_dataframe_format(self.df)
-        self.df.to_csv(path)
-
-    def prevalence(self, sample_id: int):
-        sel = self.df.loc[sample_id]
-        if sel.empty:
-            return None
-        else:
-            return sel.values.flatten()
-
-    def iterrows(self):
-        for index, row in self.df.iterrows():
-            prevalence = row.values.flatten()
-            yield index, prevalence
-
-    @classmethod
-    def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
-        try:
-            df = pd.read_csv(path, index_col=0)
-        except Exception as e:
-            print(f'the file {path} does not seem to be a valid csv file')
-            raise e
-        return ResultSubmission.check_dataframe_format(df, path=path)
-
-    @classmethod
-    def check_dataframe_format(cls, df, path=None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
-        hint_path = ''  # if given, show the data path in the error message
-        if path is not None:
-            hint_path = f' in {path}'
-
-        if df.index.name != 'id' or len(df.columns) < 2:
-            raise ValueError(f'wrong header{hint_path}, '
-                             f'the format of the header should be "id,0,...,n-1", '
-                             f'where n is the number of categories')
-        if [int(ci) for ci in df.columns.values] != list(range(len(df.columns))):
-            raise ValueError(f'wrong header{hint_path}, category ids should be 0,1,2,...,n-1, '
-                             f'where n is the number of categories')
-        if df.empty:
-            raise ValueError(f'error{hint_path}: results file is empty')
-        elif len(df) != constants.DEV_SAMPLES and len(df) != constants.TEST_SAMPLES:
-            raise ValueError(f'wrong number of prevalence values found{hint_path}; '
-                             f'expected {constants.DEV_SAMPLES} for development sets and '
-                             f'{constants.TEST_SAMPLES} for test sets; found {len(df)}')
-
-        ids = set(df.index.values)
-        expected_ids = set(range(len(df)))
-        if ids != expected_ids:
-            missing = expected_ids - ids
-            if missing:
-                raise ValueError(f'there are {len(missing)} missing ids{hint_path}: {sorted(missing)}')
-            unexpected = ids - expected_ids
-            if unexpected:
-                raise ValueError(f'there are {len(unexpected)} unexpected ids{hint_path}: {sorted(unexpected)}')
-
-        for category_id in df.columns:
-            if (df[category_id] < 0).any() or (df[category_id] > 1).any():
-                raise ValueError(f'error{hint_path} column "{category_id}" contains values out of range [0,1]')
-
-        prevs = df.values
-        round_errors = np.abs(prevs.sum(axis=-1) - 1.) > constants.ERROR_TOL
-        if round_errors.any():
-            raise ValueError(f'error: prevalence values in rows with id {np.where(round_errors)[0].tolist()} '
-                             f'do not sum up to 1 (error tolerance {constants.ERROR_TOL}), '
-                             f'probably due to some rounding errors')
-
-        return df
-
-
-def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=None, average=True):
-    if sample_size is None:
-        if qp.environ['SAMPLE_SIZE'] is None:
-            raise ValueError('Relative Absolute Error cannot be computed: '
-                             'neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')
-        else:
-            sample_size = qp.environ['SAMPLE_SIZE']
-
-    if len(true_prevs) != len(predicted_prevs):
-        raise ValueError(f'size mismatch, the ground truth file has {len(true_prevs)} entries '
-                         f'while the file of predictions contains {len(predicted_prevs)} entries')
-    if true_prevs.n_categories != predicted_prevs.n_categories:
-        raise ValueError(f'these result files are not comparable since the categories are different: '
-                         f'true={true_prevs.n_categories} categories vs. '
-                         f'predictions={predicted_prevs.n_categories} categories')
-    rae, ae = [], []
-    for sample_id, true_prevalence in true_prevs.iterrows():
-        pred_prevalence = predicted_prevs.prevalence(sample_id)
-        rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size)))
-        ae.append(qp.error.ae(true_prevalence, pred_prevalence))
-
-    rae = np.asarray(rae)
-    ae = np.asarray(ae)
-
-    if average:
-        return rae.mean(), ae.mean()
-    else:
-        return rae, ae
-
-
-
-
-
-
-
-
-
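
A hedged usage sketch of the main entry points above (the 'data/...' paths and the uniform prevalence estimates are placeholders; a real submission needs one row per dev sample before dump's format check passes):

    import numpy as np
    from data import ResultSubmission, gen_load_samples, load_vector_documents
    import constants

    # iterate over dev samples together with their true prevalence vectors
    for sample, prev in gen_load_samples('data/dev_samples',
                                         ground_truth_path='data/dev_prevalences.txt',
                                         load_fn=load_vector_documents):
        ...  # e.g., evaluate a quantifier on each (sample, prev) pair

    # build a submission: one prevalence vector per sample id, each summing to 1
    submission = ResultSubmission()
    for sample_id in range(constants.DEV_SAMPLES):
        submission.add(sample_id, np.asarray([0.5, 0.5]))  # placeholder estimates
    submission.dump('my_predictions.csv')  # dump re-checks the format before writing
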
diff --git a/LeQua2022/evaluate.py b/LeQua2022/evaluate.py
deleted file mode 100644
index c21e368..0000000
--- a/LeQua2022/evaluate.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import argparse
-import quapy as qp
-from data import ResultSubmission, evaluate_submission
-import constants
-
-"""
-LeQua2022 official evaluation script
-"""
-
-def main(args):
-
-    sample_size = constants.SAMPLE_SIZE[args.task]
-
-    true_prev = ResultSubmission.load(args.true_prevalences)
-    pred_prev = ResultSubmission.load(args.pred_prevalences)
-
-    mrae, mae = evaluate_submission(true_prev, pred_prev, sample_size)
-    print(f'MRAE: {mrae:.4f}')
-    print(f'MAE: {mae:.4f}')
-
-    if args.output is not None:
-        with open(args.output, 'wt') as foo:
-            foo.write(f'MRAE: {mrae:.4f}\n')
-            foo.write(f'MAE: {mae:.4f}\n')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 official evaluation script')
-    parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'],
-                        help='Task name (T1A, T1B, T2A, T2B)')
-    parser.add_argument('true_prevalences', metavar='TRUE-PREV-PATH', type=str,
-                        help='Path of the ground truth prevalence values file (.csv)')
-    parser.add_argument('pred_prevalences', metavar='PRED-PREV-PATH', type=str,
-                        help='Path of the predicted prevalence values file (.csv)')
-    parser.add_argument('--output', metavar='SCORES-PATH', type=str, default=None,
-                        help='Path where to store the evaluation scores')
-    args = parser.parse_args()
-
-    if args.output is not None:
-        qp.util.create_parent_dir(args.output)
-
-    main(args)
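
The MRAE reported above relies on qp.error.rae with eps=1/(2*sample_size); a sketch of the per-sample computation, assuming quapy's usual additive smoothing (an assumption worth checking against quapy.error):

    import numpy as np

    def smoothed_rae(p_true, p_pred, eps):
        # smoothing keeps each ratio finite when a true prevalence is zero
        p_true = (p_true + eps) / (eps * len(p_true) + 1)
        p_pred = (p_pred + eps) / (eps * len(p_pred) + 1)
        return np.mean(np.abs(p_pred - p_true) / p_true)
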
diff --git a/LeQua2022/format_checker.py b/LeQua2022/format_checker.py
deleted file mode 100644
index d9ae549..0000000
--- a/LeQua2022/format_checker.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import argparse
-from data import ResultSubmission
-
-
-"""
-LeQua2022 official format-checker script
-"""
-
-def main(args):
-    try:
-        ResultSubmission.check_file_format(args.prevalence_file)
-    except Exception as e:
-        print(e)
-        print('Format check: [not passed]')
-    else:
-        print('Format check: [passed]')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 official format-checker script')
-    parser.add_argument('prevalence_file', metavar='PREVALENCEFILE-PATH', type=str,
-                        help='Path of the file containing the prevalence values to check')
-    args = parser.parse_args()
-
-    main(args)
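
One design note: main() above prints its verdict but always exits with status 0. A hedged variant that signals failure through the exit status, useful in shell pipelines (same API, only sys.exit added):

    import sys
    try:
        ResultSubmission.check_file_format(args.prevalence_file)
    except Exception as e:
        print(e)
        print('Format check: [not passed]')
        sys.exit(1)
    print('Format check: [passed]')
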
diff --git a/LeQua2022/predict.py b/LeQua2022/predict.py
deleted file mode 100644
index b014468..0000000
--- a/LeQua2022/predict.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import argparse
-import quapy as qp
-from data import ResultSubmission
-import os
-import pickle
-from tqdm import tqdm
-from data import gen_load_samples, load_vector_documents
-from glob import glob
-import constants
-
-"""
-LeQua2022 prediction script
-"""
-
-def main(args):
-
-    # check the number of samples
-    nsamples = len(glob(os.path.join(args.samples, '*.txt')))
-    if nsamples not in {constants.DEV_SAMPLES, constants.TEST_SAMPLES}:
-        print(f'Warning: the number of samples matches neither the expected number of '
-              f'dev samples ({constants.DEV_SAMPLES}) nor the expected number of '
-              f'test samples ({constants.TEST_SAMPLES})')
-
-    # load the pickled model
-    model = pickle.load(open(args.model, 'rb'))
-
-    # predictions
-    predictions = ResultSubmission()
-    for sampleid, sample in tqdm(gen_load_samples(args.samples, return_id=True, load_fn=load_vector_documents), desc='predicting', total=nsamples):
-        predictions.add(sampleid, model.quantify(sample))
-
-    # saving
-    qp.util.create_parent_dir(args.output)
-    predictions.dump(args.output)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 prediction script')
-    parser.add_argument('model', metavar='MODEL-PATH', type=str,
-                        help='Path of the saved model')
-    parser.add_argument('samples', metavar='SAMPLES-PATH', type=str,
-                        help='Path to the directory containing the samples')
-    parser.add_argument('output', metavar='PREDICTIONS-PATH', type=str,
-                        help='Path where to store the predictions file')
-    parser.add_argument('nf', metavar='NUM-FEATURES', type=int,
-                        help='Number of features seen during training (currently unused)')
-    args = parser.parse_args()
-
-    if not os.path.exists(args.samples):
-        raise FileNotFoundError(f'path {args.samples} does not exist')
-    if not os.path.isdir(args.samples):
-        raise ValueError(f'path {args.samples} is not a valid directory')
-
-    main(args)
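
For context, the end-to-end flow of the removed scripts, as a hypothetical session (paths and file names are placeholders; 300 matches the vector dimensionality implied by the 301-column check in data.py):

    python baselines.py T1A data/T1A models                                   # model selection + training
    python predict.py models/T1A/SLD.pkl data/T1A/test_samples preds.csv 300
    python format_checker.py preds.csv
    python evaluate.py T1A data/T1A/test_prevalences.txt preds.csv --output scores.txt
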