From 9a08125e7ea86e3de1bdc32985a6969263fa63fa Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Mon, 25 Oct 2021 13:37:22 +0200
Subject: [PATCH] evaluation script and format checker added

---
 LeQua2022/TODO.txt              |   3 +-
 LeQua2022/constants.py          |   6 ++
 LeQua2022/data.py               | 112 ++++++++++++++++++++------------
 LeQua2022/evaluation.py         |  41 ++++++++++++
 LeQua2022/format_checker.py     |  27 ++++++++
 LeQua2022/main_binary_vector.py |  60 ++++++-----------
 quapy/data/preprocessing.py     |   2 +-
 7 files changed, 166 insertions(+), 85 deletions(-)
 create mode 100644 LeQua2022/constants.py
 create mode 100644 LeQua2022/evaluation.py
 create mode 100644 LeQua2022/format_checker.py

diff --git a/LeQua2022/TODO.txt b/LeQua2022/TODO.txt
index e51cf0d..2bc944e 100644
--- a/LeQua2022/TODO.txt
+++ b/LeQua2022/TODO.txt
@@ -5,4 +5,5 @@
 5. plots
 6. I am reading the samples in order, and there is no need to. It would be better to have a generic
 function that reads all the examples and in any case generates an output with the same file name
-7. Make ResultSubmission class abstract, and create 4 instances thus forcing the field task_name to be set correctly
\ No newline at end of file
+7. Make ResultSubmission class abstract, and create 4 instances thus forcing the field task_name to be set correctly
+8. I am not convinced that the samples (in the case where there is no ground truth) are read in random order
diff --git a/LeQua2022/constants.py b/LeQua2022/constants.py
new file mode 100644
index 0000000..1162e12
--- /dev/null
+++ b/LeQua2022/constants.py
@@ -0,0 +1,6 @@
+DEV_SAMPLES = 1000
+TEST_SAMPLES = 5000
+
+T1A_SAMPLE_SIZE = 250
+
+ERROR_TOL = 1E-3
diff --git a/LeQua2022/data.py b/LeQua2022/data.py
index 2d99120..815fc30 100644
--- a/LeQua2022/data.py
+++ b/LeQua2022/data.py
@@ -7,6 +7,9 @@ import quapy as qp
 import numpy as np
 import sklearn
 import re
+from glob import glob
+
+import constants
 
 
 # def load_binary_raw_document(path):
@@ -20,14 +23,38 @@ import re
 # def load_multiclass_raw_document(path):
 #     return qp.data.from_text(path, verbose=0, class2int=False)
 
+def load_category_map(path):
+    cat2code = {}
+    with open(path, 'rt') as fin:
+        for category, code in (line.split() for line in fin):
+            cat2code[category] = int(code)
+    return cat2code
+
 
 def load_binary_vectors(path, nF=None):
     return sklearn.datasets.load_svmlight_file(path, n_features=nF)
 
 
-def gen_load_samples_T1A(path_dir:str, ground_truth_path:str = None):
-    # for ... 
: yield - pass +def __gen_load_samples_with_groudtruth(path_dir:str, ground_truth_path:str, load_fn, **load_kwargs): + true_prevs = ResultSubmission.load(ground_truth_path) + for filename, prevalence in true_prevs.iterrows(): + sample, _ = load_fn(os.path.join(path_dir, filename), **load_kwargs) + yield filename, sample, prevalence + + +def __gen_load_samples_without_groudtruth(path_dir:str, load_fn, **load_kwargs): + for filepath in glob(os.path.join(path_dir, '*_sample_*.txt')): + sample, _ = load_fn(filepath, **load_kwargs) + yield os.path.basename(filepath), sample + + +def gen_load_samples_T1A(path_dir:str, nF:int, ground_truth_path:str = None): + if ground_truth_path is None: + for filename, sample in __gen_load_samples_without_groudtruth(path_dir, load_binary_vectors, nF=nF): + yield filename, sample + else: + for filename, sample, prevalence in __gen_load_samples_with_groudtruth(path_dir, ground_truth_path, load_binary_vectors, nF=nF): + yield filename, sample, prevalence def gen_load_samples_T1B(path_dir:str, ground_truth_path:str = None): @@ -46,9 +73,6 @@ def gen_load_samples_T2B(path_dir:str, ground_truth_path:str = None): class ResultSubmission: - DEV_LEN = 1000 - TEST_LEN = 5000 - ERROR_TOL = 1E-3 def __init__(self, categories: List[str]): if not isinstance(categories, list) or len(categories) < 2: @@ -80,9 +104,9 @@ class ResultSubmission: raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}') if (prevalence_values<0).any() or (prevalence_values>1).any(): raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_name}"') - if np.abs(prevalence_values.sum()-1) > ResultSubmission.ERROR_TOL: + if np.abs(prevalence_values.sum()-1) > constants.ERROR_TOL: raise ValueError(f'error: prevalence values do not sum up to one for "{sample_name}"' - f'(error tolerance {ResultSubmission.ERROR_TOL})') + f'(error tolerance {constants.ERROR_TOL})') new_entry = dict([('filename',sample_name)]+[(col_i,prev_i) for col_i, prev_i in zip(self.categories, prevalence_values)]) self.df = self.df.append(new_entry, ignore_index=True) @@ -93,7 +117,7 @@ class ResultSubmission: @classmethod def load(cls, path: str) -> 'ResultSubmission': df, inferred_type = ResultSubmission.check_file_format(path, return_inferred_type=True) - r = ResultSubmission(categories=df.columns.values.tolist()) + r = ResultSubmission(categories=df.columns.values[1:].tolist()) r.inferred_type = inferred_type r.df = df return r @@ -102,13 +126,19 @@ class ResultSubmission: ResultSubmission.check_dataframe_format(self.df) self.df.to_csv(path) - def get(self, sample_name:str): + def prevalence(self, sample_name:str): sel = self.df.loc[self.df['filename'] == sample_name] if sel.empty: return None else: return sel.loc[:,self.df.columns[1]:].values.flatten() + def iterrows(self): + for index, row in self.df.iterrows(): + filename = row.filename + prevalence = row[self.df.columns[1]:].values.flatten() + yield filename, prevalence + @classmethod def check_file_format(cls, path, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]: df = pd.read_csv(path, index_col=0) @@ -116,7 +146,7 @@ class ResultSubmission: @classmethod def check_dataframe_format(cls, df, path=None, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]: - hint_path = '' # if given, show the data path in the error messages + hint_path = '' # if given, show the data path in the error message if path is not None: hint_path = f' in {path}' @@ -125,33 +155,33 @@ class 
ResultSubmission: if df.empty: raise ValueError(f'error{hint_path}: results file is empty') - elif len(df) == ResultSubmission.DEV_LEN: + elif len(df) == constants.DEV_SAMPLES: inferred_type = 'dev' - expected_len = ResultSubmission.DEV_LEN - elif len(df) == ResultSubmission.TEST_LEN: + expected_len = constants.DEV_SAMPLES + elif len(df) == constants.TEST_SAMPLES: inferred_type = 'test' - expected_len = ResultSubmission.TEST_LEN + expected_len = constants.TEST_SAMPLES else: raise ValueError(f'wrong number of prevalence values found{hint_path}; ' - f'expected {ResultSubmission.DEV_LEN} for development sets and ' - f'{ResultSubmission.TEST_LEN} for test sets; found {len(df)}') + f'expected {constants.DEV_SAMPLES} for development sets and ' + f'{constants.TEST_SAMPLES} for test sets; found {len(df)}') set_names = frozenset(df.filename) for i in range(expected_len): if f'{inferred_type}_sample_{i}.txt' not in set_names: - raise ValueError(f'{hint_path} a file with {len(df)} entries is assumed to be of type ' + raise ValueError(f'error{hint_path} a file with {len(df)} entries is assumed to be of type ' f'"{inferred_type}" but entry {inferred_type}_sample_{i}.txt is missing ' f'(among perhaps many others)') for category_name in df.columns[1:]: if (df[category_name] < 0).any() or (df[category_name] > 1).any(): - raise ValueError(f'{hint_path} column "{category_name}" contains values out of range [0,1]') + raise ValueError(f'error{hint_path} column "{category_name}" contains values out of range [0,1]') prevs = df.loc[:, df.columns[1]:].values - round_errors = np.abs(prevs.sum(axis=-1) - 1.) > ResultSubmission.ERROR_TOL + round_errors = np.abs(prevs.sum(axis=-1) - 1.) > constants.ERROR_TOL if round_errors.any(): raise ValueError(f'warning: prevalence values in rows with id {np.where(round_errors)[0].tolist()} ' - f'do not sum up to 1 (error tolerance {ResultSubmission.ERROR_TOL}), ' + f'do not sum up to 1 (error tolerance {constants.ERROR_TOL}), ' f'probably due to some rounding errors.') if return_inferred_type: @@ -163,20 +193,31 @@ class ResultSubmission: self.df = self.df.reindex([self.df.columns[0]] + sorted(self.df.columns[1:]), axis=1) self.categories = sorted(self.categories) + def filenames(self): + return self.df.filename.values -def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=1000, average=True): +def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=None, average=True): + if sample_size is None: + if qp.environ['SAMPLE_SIZE'] is None: + raise ValueError('Relative Absolute Error cannot be computed: ' + 'neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified') + else: + sample_size = qp.environ['SAMPLE_SIZE'] + if len(true_prevs) != len(predicted_prevs): - raise ValueError(f'size mismatch, groun truth has {len(true_prevs)} entries ' - f'while predictions contain {len(predicted_prevs)} entries') + raise ValueError(f'size mismatch, ground truth file has {len(true_prevs)} entries ' + f'while the file of predictions contain {len(predicted_prevs)} entries') true_prevs.sort_categories() predicted_prevs.sort_categories() if true_prevs.categories != predicted_prevs.categories: - raise ValueError(f'these result files are not comparable since the categories are different') + raise ValueError(f'these result files are not comparable since the categories are different: ' + f'true={true_prevs.categories} vs. 
predictions={predicted_prevs.categories}') ae, rae = [], [] - for sample_name in true_prevs.df.filename.values: - ae.append(qp.error.mae(true_prevs.get(sample_name), predicted_prevs.get(sample_name))) - rae.append(qp.error.mrae(true_prevs.get(sample_name), predicted_prevs.get(sample_name), eps=sample_size)) + for sample_name, true_prevalence in true_prevs.iterrows(): + pred_prevalence = predicted_prevs.prevalence(sample_name) + ae.append(qp.error.ae(true_prevalence, pred_prevalence)) + rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size))) ae = np.asarray(ae) rae = np.asarray(rae) if average: @@ -187,21 +228,6 @@ def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSub -# r = ResultSubmission(['negative', 'positive']) -# from tqdm import tqdm -# for i in tqdm(range(1000), total=1000): -# r.add(f'dev_sample_{i}.txt', np.asarray([0.5, 0.5])) -# r.dump('./path.csv') - -r = ResultSubmission.load('./data/T1A/public/dummy_submission.csv') -t = ResultSubmission.load('./data/T1A/public/dummy_submission (copy).csv') -# print(r.df) -# print(r.get('dev_sample_10.txt')) -print(evaluate_submission(r, t)) - -# s = ResultSubmission.load('./data/T1A/public/dummy_submission.csv') -# -# print(s) diff --git a/LeQua2022/evaluation.py b/LeQua2022/evaluation.py new file mode 100644 index 0000000..e56d6d5 --- /dev/null +++ b/LeQua2022/evaluation.py @@ -0,0 +1,41 @@ +import argparse +import quapy as qp +from data import ResultSubmission, evaluate_submission +import constants +import os + +""" +LeQua2022 Official evaluation script +""" + +def main(args): + if args.task in {'T1A'}: + qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE + true_prev = ResultSubmission.load(args.true_prevalences) + pred_prev = ResultSubmission.load(args.pred_prevalences) + mae, mrae = evaluate_submission(true_prev, pred_prev) + print(f'MAE: {mae:.4f}') + print(f'MRAE: {mrae:.4f}') + + if args.output is not None: + outdir = os.path.dirname(args.output) + if outdir: + os.makedirs(outdir, exist_ok=True) + with open(args.output, 'wt') as foo: + foo.write(f'MAE: {mae:.4f}\n') + foo.write(f'MRAE: {mrae:.4f}\n') + + +if __name__=='__main__': + parser = argparse.ArgumentParser(description='LeQua2022 official evaluation script') + parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'], + help='Task name (T1A, T1B, T2A, T2B)') + parser.add_argument('true_prevalences', metavar='TRUE-PREV-PATH', type=str, + help='Path of ground truth prevalence values file (.csv)') + parser.add_argument('pred_prevalences', metavar='PRED-PREV-PATH', type=str, + help='Path of predicted prevalence values file (.csv)') + parser.add_argument('--output', metavar='SCORES-PATH', type=str, default=None, + help='Path where to store the evaluation scores') + args = parser.parse_args() + + main(args) diff --git a/LeQua2022/format_checker.py b/LeQua2022/format_checker.py new file mode 100644 index 0000000..25f3e45 --- /dev/null +++ b/LeQua2022/format_checker.py @@ -0,0 +1,27 @@ +import argparse +import quapy as qp +from data import ResultSubmission, evaluate_submission +import constants +import os + +""" +LeQua2022 Official format-checker script +""" + +def main(args): + try: + ResultSubmission.check_file_format(args.prevalence_file) + except Exception as e: + print(e) + print('Format check: not passed') + else: + print('Format check: passed') + + +if __name__=='__main__': + parser = argparse.ArgumentParser(description='LeQua2022 official format-checker script') + 
parser.add_argument('prevalence_file', metavar='PREV-PATH', type=str, + help='Path of the file containing prevalence values to check') + args = parser.parse_args() + + main(args) diff --git a/LeQua2022/main_binary_vector.py b/LeQua2022/main_binary_vector.py index 2930091..c9e87a0 100644 --- a/LeQua2022/main_binary_vector.py +++ b/LeQua2022/main_binary_vector.py @@ -9,64 +9,44 @@ import quapy as qp from quapy.data import LabelledCollection from quapy.method.aggregative import * import quapy.functional as F -from data import load_binary_vectors +from data import * import os +import constants -path_binary_vector = './data/T1A' -result_path = os.path.join('results', 'T1A') # binary - vector -os.makedirs(result_path, exist_ok=True) +predictions_path = os.path.join('predictions', 'T1A') # binary - vector +os.makedirs(predictions_path, exist_ok=True) -train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt') - -train = LabelledCollection.load(train_file, load_binary_vectors) +pathT1A = './data/T1A/public' +T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors') +T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv') +T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt') +train = LabelledCollection.load(T1A_trainpath, load_binary_vectors) nF = train.instances.shape[1] +qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE + print(f'number of classes: {len(train.classes_)}') print(f'number of training documents: {len(train)}') print(f'training prevalence: {F.strprev(train.prevalence())}') print(f'training matrix shape: {train.instances.shape}') -dev_prev = pd.read_csv(os.path.join(path_binary_vector, 'public', 'dev_prevalences.csv'), index_col=0) -print(dev_prev) +true_prevalence = ResultSubmission.load(T1A_devprevalence_path) - -scores = {} -for quantifier in [CC]: #, ACC, PCC, PACC, EMQ, HDy]: +for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]: classifier = CalibratedClassifierCV(LogisticRegression()) model = quantifier(classifier).fit(train) quantifier_name = model.__class__.__name__ - scores[quantifier_name]={} - for sample_set, sample_size in [('dev', 1000)]: - ae_errors, rae_errors = [], [] - for i, row in tqdm(dev_prev.iterrows(), total=len(dev_prev), desc=f'testing {quantifier_name} in {sample_set}'): - filename = row['filename'] - prev_true = row[1:].values - sample_path = os.path.join(path_binary_vector, 'public', f'{sample_set}_vectors', filename) - sample, _ = load_binary_vectors(sample_path, nF) - qp.environ['SAMPLE_SIZE'] = sample.shape[0] - prev_estim = model.quantify(sample) - # prev_true = sample.prevalence() - ae_errors.append(qp.error.mae(prev_true, prev_estim)) - rae_errors.append(qp.error.mrae(prev_true, prev_estim)) - - ae_errors = np.asarray(ae_errors) - rae_errors = np.asarray(rae_errors) - - mae = ae_errors.mean() - mrae = rae_errors.mean() - scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae} - pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) - pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) - print(f'{quantifier_name} {sample_set} MAE={mae:.4f}') - print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}') - -for model in scores: - for sample_set in ['validation']:#, 'test']: - print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}') + predictions = ResultSubmission(categories=['negative', 'positive']) + for 
samplename, sample in tqdm(gen_load_samples_T1A(T1A_devvectors_path, nF), + desc=quantifier_name, total=len(true_prevalence)): + predictions.add(samplename, model.quantify(sample)) + predictions.dump(os.path.join(predictions_path, quantifier_name + '.csv')) + mae, mrae = evaluate_submission(true_prevalence, predictions) + print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}') """ test: diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py index ee1627e..6e58718 100644 --- a/quapy/data/preprocessing.py +++ b/quapy/data/preprocessing.py @@ -149,7 +149,7 @@ class IndexTransformer: def index(self, documents): vocab = self.vocabulary_.copy() - return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')] + return [[vocab.prevalence(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')] def fit_transform(self, X, n_jobs=-1): return self.fit(X).transform(X, n_jobs=n_jobs)
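
Usage sketch (not part of the patch itself): how the new format checker, evaluation script, and the
ResultSubmission/evaluate_submission API added above might be exercised, assuming the commands are run
from inside the LeQua2022/ directory. The dev_prevalences.csv path and the CC.csv predictions file are
illustrative assumptions, not artifacts shipped with this patch.

    # Command line, following the argparse definitions in format_checker.py and evaluation.py:
    #   python3 format_checker.py predictions/T1A/CC.csv
    #   python3 evaluation.py T1A data/T1A/public/dev_prevalences.csv predictions/T1A/CC.csv --output scores/T1A/CC.txt

    # Programmatic equivalent (the same calls evaluation.py performs internally):
    import quapy as qp
    import constants
    from data import ResultSubmission, evaluate_submission

    qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE                     # 250 for task T1A
    true_prev = ResultSubmission.load('data/T1A/public/dev_prevalences.csv')  # ground-truth prevalence values (assumed path)
    pred_prev = ResultSubmission.load('predictions/T1A/CC.csv')               # predicted prevalence values (assumed path)
    mae, mrae = evaluate_submission(true_prev, pred_prev)                     # mean AE / mean RAE over all samples
    print(f'MAE={mae:.4f} MRAE={mrae:.4f}')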