diff --git a/LeQua2022/TODO.txt b/LeQua2022/TODO.txt
index e51cf0d..1e16136 100644
--- a/LeQua2022/TODO.txt
+++ b/LeQua2022/TODO.txt
@@ -1,8 +1,13 @@
-1. the tests should be run assuming the labels do not exist, i.e., looking at the results in the "prevalences" files (rename)
 2. tables?
 3. fetch dataset (download, unzip, etc.)
 4. model selection
 5. plots
-6. I am reading the samples in order, and that is not necessary. A generic function that reads all the examples and
-   that in any case generates an output with the same name as the file would be better
-7. Make ResultSubmission class abstract, and create 4 instances thus forcing the field task_name to be set correctly
\ No newline at end of file
+8. I am not convinced that the samples (in the case without ground truth) are read in random order
+9. Experiment with dense vectors (e.g., PCA over tfidf)
+10. If we change the format of the samples (e.g., from svmlight with .txt to PCA with .dat), things have to change
+    in the code. A glob(*.txt) is written in several places
+11. Remove the categories as columns of the prevalences files
+12. sample_size cannot be set to a non-integer in GridSearchQ with protocol="gen" (it could, but this is not stated in the doc)
+13. repair the documentation of GridSearchQ
+14. repair the calibration in LR (I had to remove it for GridSearchQ to work, and I removed it in all the files)
+15. eval_budget could be used in GridSearchQ with a generator function for the tqdm progress bar
\ No newline at end of file
diff --git a/LeQua2022/baselinesSVD_T1A.py b/LeQua2022/baselinesSVD_T1A.py
new file mode 100644
index 0000000..c0fdc15
--- /dev/null
+++ b/LeQua2022/baselinesSVD_T1A.py
@@ -0,0 +1,84 @@
+import pickle
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+import pandas as pd
+
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import *
+import quapy.functional as F
+from data import *
+import os
+import constants
+
+from sklearn.decomposition import TruncatedSVD
+
+
+# LeQua official baselines for task T1A (Binary/Vector)
+# =====================================================
+
+predictions_path = os.path.join('predictions', 'T1A')
+os.makedirs(predictions_path, exist_ok=True)
+
+models_path = os.path.join('models', 'T1A')
+os.makedirs(models_path, exist_ok=True)
+
+pathT1A = './data/T1A/public'
+T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
+T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
+T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt')
+
+train = LabelledCollection.load(T1A_trainpath, load_binary_vectors)
+nF = train.instances.shape[1]
+svd = TruncatedSVD(n_components=300)
+train.instances = svd.fit_transform(train.instances)
+
+qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE
+
+print(f'number of classes: {len(train.classes_)}')
+print(f'number of training documents: {len(train)}')
+print(f'training prevalence: {F.strprev(train.prevalence())}')
+print(f'training matrix shape: {train.instances.shape}')
+
+true_prevalence = ResultSubmission.load(T1A_devprevalence_path)
+
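+# note: each dev sample in the loop below must be projected with the
+# already-fitted svd (svd.transform, never fit_transform), so that it lives in
+# the same 300-dimensional dense space as the training matrix (cf. TODO item 9)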
+for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
+
+    # classifier = CalibratedClassifierCV(LogisticRegression())
+    classifier = LogisticRegression()
+    model = quantifier(classifier).fit(train)
+    quantifier_name = model.__class__.__name__
+
+    predictions = ResultSubmission(categories=['negative', 'positive'])
+    for samplename, sample in tqdm(gen_load_samples_T1(T1A_devvectors_path, nF),
+                                   desc=quantifier_name, total=len(true_prevalence)):
+        sample = svd.transform(sample)
+        predictions.add(samplename, model.quantify(sample))
+
+    predictions.dump(os.path.join(predictions_path, quantifier_name + '.svd.csv'))
+    pickle.dump(model, open(os.path.join(models_path, quantifier_name+'.svd.pkl'), 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
+
+    mae, mrae = evaluate_submission(true_prevalence, predictions)
+    print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
+
+"""
+test:
+CC 0.1859 1.5406
+ACC 0.0453 0.2840
+PCC 0.1793 1.7187
+PACC 0.0287 0.1494
+EMQ 0.0225 0.1020
+HDy 0.0631 0.2307
+
+validation
+CC 0.1862 1.9587
+ACC 0.0394 0.2669
+PCC 0.1789 2.1383
+PACC 0.0354 0.1587
+EMQ 0.0224 0.0960
+HDy 0.0467 0.2121
+"""
+
+
diff --git a/LeQua2022/baselines_T1A.py b/LeQua2022/baselines_T1A.py
new file mode 100644
index 0000000..179995c
--- /dev/null
+++ b/LeQua2022/baselines_T1A.py
@@ -0,0 +1,79 @@
+import pickle
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+import pandas as pd
+
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import *
+import quapy.functional as F
+from data import *
+import os
+import constants
+
+
+# LeQua official baselines for task T1A (Binary/Vector)
+# =====================================================
+
+predictions_path = os.path.join('predictions', 'T1A')
+os.makedirs(predictions_path, exist_ok=True)
+
+models_path = os.path.join('models', 'T1A')
+os.makedirs(models_path, exist_ok=True)
+
+pathT1A = './data/T1A/public'
+T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
+T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
+T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt')
+
+train = LabelledCollection.load(T1A_trainpath, load_binary_vectors)
+nF = train.instances.shape[1]
+
+qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE
+
+print(f'number of classes: {len(train.classes_)}')
+print(f'number of training documents: {len(train)}')
+print(f'training prevalence: {F.strprev(train.prevalence())}')
+print(f'training matrix shape: {train.instances.shape}')
+
+true_prevalence = ResultSubmission.load(T1A_devprevalence_path)
+
+for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
+
+    # classifier = CalibratedClassifierCV(LogisticRegression(C=1))
+    classifier = LogisticRegression(C=1)
+    model = quantifier(classifier).fit(train)
+    quantifier_name = model.__class__.__name__
+
+    predictions = ResultSubmission(categories=['negative', 'positive'])
+    for samplename, sample in tqdm(gen_load_samples_T1(T1A_devvectors_path, nF),
+                                   desc=quantifier_name, total=len(true_prevalence)):
+        predictions.add(samplename, model.quantify(sample))
+
+    predictions.dump(os.path.join(predictions_path, quantifier_name + '.csv'))
+    pickle.dump(model, open(os.path.join(models_path, quantifier_name+'.pkl'), 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
+
+    mae, mrae = evaluate_submission(true_prevalence, predictions)
+    print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
+
+"""
+test:
+CC 0.1859 1.5406
+ACC 0.0453 0.2840
+PCC 0.1793 1.7187
+PACC 0.0287 0.1494
+EMQ 0.0225 0.1020
+HDy 0.0631 0.2307
+
+validation
+CC 0.1862 1.9587
+ACC 0.0394 0.2669
+PCC 0.1789 2.1383
+PACC 0.0354 0.1587
+EMQ 0.0224 0.0960
+HDy 0.0467 0.2121
+"""
+
+
diff --git a/LeQua2022/baselines_T1Amodsel.py b/LeQua2022/baselines_T1Amodsel.py
new file mode 100644
index 0000000..c312135
--- /dev/null
+++ b/LeQua2022/baselines_T1Amodsel.py
@@ -0,0 +1,91 @@
+import pickle
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+import pandas as pd
+
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import *
+import quapy.functional as F
+from data import *
+import os
+import constants
+
+
+# LeQua official baselines for task T1A (Binary/Vector)
+# =====================================================
+
+predictions_path = os.path.join('predictions', 'T1A')
+os.makedirs(predictions_path, exist_ok=True)
+
+models_path = os.path.join('models', 'T1A')
+os.makedirs(models_path, exist_ok=True)
+
+pathT1A = './data/T1A/public'
+T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
+T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
+T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt')
+
+train = LabelledCollection.load(T1A_trainpath, load_binary_vectors)
+nF = train.instances.shape[1]
+
+qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE
+
+print(f'number of classes: {len(train.classes_)}')
+print(f'number of training documents: {len(train)}')
+print(f'training prevalence: {F.strprev(train.prevalence())}')
+print(f'training matrix shape: {train.instances.shape}')
+
+true_prevalence = ResultSubmission.load(T1A_devprevalence_path)
+
+param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
+
+
+def gen_samples():
+    return gen_load_samples_T1(T1A_devvectors_path, nF, ground_truth_path=T1A_devprevalence_path, return_filename=False)
+
+
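+# gen_samples is deliberately a callable that builds a fresh generator on each
+# call: GridSearchQ with protocol='gen' invokes it once per hyperparameter
+# combination, and a bare generator object would be exhausted after the first one
+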
+for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
+
+    # classifier = CalibratedClassifierCV(LogisticRegression(), n_jobs=-1)
+    classifier = LogisticRegression()
+    model = quantifier(classifier)
+    print(f'{model.__class__.__name__}: Model selection')
+    model = qp.model_selection.GridSearchQ(
+        model,
+        param_grid,
+        sample_size=None,
+        protocol='gen',
+        error=qp.error.mae,
+        refit=False,
+        verbose=True
+    ).fit(train, gen_samples)
+
+    quantifier_name = model.best_model().__class__.__name__
+    print(f'{quantifier_name} mae={model.best_score_:.3f} (params: {model.best_params_})')
+
+    pickle.dump(model.best_model(),
+                open(os.path.join(models_path, quantifier_name+'.modsel.pkl'), 'wb'),
+                protocol=pickle.HIGHEST_PROTOCOL)
+
+
+"""
+test:
+CC 0.1859 1.5406
+ACC 0.0453 0.2840
+PCC 0.1793 1.7187
+PACC 0.0287 0.1494
+EMQ 0.0225 0.1020
+HDy 0.0631 0.2307
+
+validation
+CC 0.1862 1.9587
+ACC 0.0394 0.2669
+PCC 0.1789 2.1383
+PACC 0.0354 0.1587
+EMQ 0.0224 0.0960
+HDy 0.0467 0.2121
+"""
+
+
diff --git a/LeQua2022/baselines_T1B.py b/LeQua2022/baselines_T1B.py
new file mode 100644
index 0000000..1344bbc
--- /dev/null
+++ b/LeQua2022/baselines_T1B.py
@@ -0,0 +1,55 @@
+import pickle
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+import pandas as pd
+
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import *
+import quapy.functional as F
+from data import *
+import os
+import constants
+
+predictions_path = os.path.join('predictions', 'T1B')  # multiclass - vector
+os.makedirs(predictions_path, exist_ok=True)
+
+pathT1B = './data/T1B/public'
+T1B_devvectors_path = os.path.join(pathT1B, 'dev_vectors')
+T1B_devprevalence_path = os.path.join(pathT1B, 'dev_prevalences.csv')
+T1B_trainpath = os.path.join(pathT1B, 'training_vectors.txt')
+T1B_catmap = os.path.join(pathT1B, 'training_vectors_label_map.txt')
+
+train = LabelledCollection.load(T1B_trainpath, load_binary_vectors)
+nF = train.instances.shape[1]
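+# each sample file is in svmlight format; loading one by hand amounts to this
+# sketch (passing nF keeps every sample aligned with the training feature space):
+#
+#   from sklearn.datasets import load_svmlight_file
+#   X, y = load_svmlight_file('dev_vectors/dev_sample_0.txt', n_features=nF)
+#   # X: csr_matrix of shape (sample_size, nF); y is a dummy for unlabelled samples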
+
+qp.environ['SAMPLE_SIZE'] = constants.T1B_SAMPLE_SIZE
+
+print(f'number of classes: {len(train.classes_)}')
+print(f'number of training documents: {len(train)}')
+print(f'training prevalence: {F.strprev(train.prevalence())}')
+print(f'training matrix shape: {train.instances.shape}')
+
+true_prevalence = ResultSubmission.load(T1B_devprevalence_path)
+
+cat2code, categories = load_category_map(T1B_catmap)
+
+for quantifier in [PACC]:  # [CC, ACC, PCC, PACC, EMQ]:
+
+    classifier = CalibratedClassifierCV(LogisticRegression())
+    model = quantifier(classifier).fit(train)
+    quantifier_name = model.__class__.__name__
+
+    predictions = ResultSubmission(categories=categories)
+    for samplename, sample in tqdm(gen_load_samples_T1(T1B_devvectors_path, nF),
+                                   desc=quantifier_name, total=len(true_prevalence)):
+        predictions.add(samplename, model.quantify(sample))
+
+    predictions.dump(os.path.join(predictions_path, quantifier_name + '.csv'))
+    mae, mrae = evaluate_submission(true_prevalence, predictions)
+    print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
+
+
diff --git a/LeQua2022/constants.py b/LeQua2022/constants.py
new file mode 100644
index 0000000..dee7f8c
--- /dev/null
+++ b/LeQua2022/constants.py
@@ -0,0 +1,7 @@
+DEV_SAMPLES = 1000
+TEST_SAMPLES = 5000
+
+T1A_SAMPLE_SIZE = 250
+T1B_SAMPLE_SIZE = 1000
+
+ERROR_TOL = 1E-3
diff --git a/LeQua2022/data.py b/LeQua2022/data.py
index be3d4ff..5068f47 100644
--- a/LeQua2022/data.py
+++ b/LeQua2022/data.py
@@ -7,6 +7,9 @@ import quapy as qp
 import numpy as np
 import sklearn
 import re
+from glob import glob
+
+import constants
 
 
 # def load_binary_raw_document(path):
@@ -20,19 +23,48 @@ import re
 # def load_multiclass_raw_document(path):
 #     return qp.data.from_text(path, verbose=0, class2int=False)
 
+def load_category_map(path):
+    cat2code = {}
+    with open(path, 'rt') as fin:
+        for line in fin:
+            category, code = line.split()
+            cat2code[category] = int(code)
+    code2cat = [cat for cat, code in sorted(cat2code.items(), key=lambda x:x[1])]
+    return cat2code, code2cat
+
 
 def load_binary_vectors(path, nF=None):
     return sklearn.datasets.load_svmlight_file(path, n_features=nF)
 
 
-def gen_load_samples_T1A(path_dir:str, ground_truth_path:str = None):
-    # for ... : yield
-    pass
+def __gen_load_samples_with_groundtruth(path_dir:str, return_filename:bool, ground_truth_path:str, load_fn, **load_kwargs):
+    true_prevs = ResultSubmission.load(ground_truth_path)
+    for filename, prevalence in true_prevs.iterrows():
+        sample, _ = load_fn(os.path.join(path_dir, filename), **load_kwargs)
+        if return_filename:
+            yield filename, sample, prevalence
+        else:
+            yield sample, prevalence
 
 
-def gen_load_samples_T1B(path_dir:str, ground_truth_path:str = None):
-    # for ... : yield
-    pass
+def __gen_load_samples_without_groundtruth(path_dir:str, return_filename:bool, load_fn, **load_kwargs):
+    for filepath in glob(os.path.join(path_dir, '*_sample_*.txt')):
+        sample, _ = load_fn(filepath, **load_kwargs)
+        if return_filename:
+            yield os.path.basename(filepath), sample
+        else:
+            yield sample
+
+
+def gen_load_samples_T1(path_dir:str, nF:int, ground_truth_path:str = None, return_filename=True):
+    if ground_truth_path is None:
+        # the generator function returns tuples (filename:str, sample:csr_matrix)
+        gen_fn = __gen_load_samples_without_groundtruth(path_dir, return_filename, load_binary_vectors, nF=nF)
+    else:
+        # the generator function returns tuples (filename:str, sample:csr_matrix, prevalence:ndarray)
+        gen_fn = __gen_load_samples_with_groundtruth(path_dir, return_filename, ground_truth_path, load_binary_vectors, nF=nF)
+    for r in gen_fn:
+        yield r
 
 
 def gen_load_samples_T2A(path_dir:str, ground_truth_path:str = None):
@@ -46,9 +78,6 @@ def gen_load_samples_T2B(path_dir:str, ground_truth_path:str = None):
 
 
 class ResultSubmission:
-    DEV_LEN = 1000
-    TEST_LEN = 5000
-    ERROR_TOL = 1E-3
 
     def __init__(self, categories: List[str]):
         if not isinstance(categories, list) or len(categories) < 2:
@@ -80,9 +109,9 @@ class ResultSubmission:
             raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}')
         if (prevalence_values<0).any() or (prevalence_values>1).any():
             raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_name}"')
-        if np.abs(prevalence_values.sum()-1) > ResultSubmission.ERROR_TOL:
+        if np.abs(prevalence_values.sum()-1) > constants.ERROR_TOL:
             raise ValueError(f'error: prevalence values do not sum up to one for "{sample_name}"'
-                             f'(error tolerance {ResultSubmission.ERROR_TOL})')
+                             f' (error tolerance {constants.ERROR_TOL})')
         new_entry = dict([('filename',sample_name)]+[(col_i,prev_i) for col_i, prev_i in zip(self.categories, prevalence_values)])
         self.df = self.df.append(new_entry, ignore_index=True)
 
@@ -93,7 +122,7 @@ class ResultSubmission:
     @classmethod
     def load(cls, path: str) -> 'ResultSubmission':
         df, inferred_type = ResultSubmission.check_file_format(path, return_inferred_type=True)
-        r = ResultSubmission(categories=df.columns.values.tolist())
+        r = ResultSubmission(categories=df.columns.values[1:].tolist())
         r.inferred_type = inferred_type
         r.df = df
         return r
@@ -102,13 +131,19 @@ class ResultSubmission:
         ResultSubmission.check_dataframe_format(self.df)
         self.df.to_csv(path)
 
-    def get(self, sample_name:str):
+    def prevalence(self, sample_name:str):
         sel = self.df.loc[self.df['filename'] == sample_name]
         if sel.empty:
             return None
         else:
             return sel.loc[:, self.df.columns[1]:].values.flatten()
 
+    def iterrows(self):
+        for index, row in self.df.iterrows():
+            filename = row.filename
+            prevalence = row[self.df.columns[1]:].values.flatten()
+            yield filename, prevalence
+
     @classmethod
     def check_file_format(cls, path, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
         df = pd.read_csv(path, index_col=0)
@@ -116,7 +151,7 @@ class ResultSubmission:
 
     @classmethod
     def check_dataframe_format(cls, df, path=None, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
-        hint_path = ''  # if given, show the data path in the error messages
+        hint_path = ''  # if given, show the data path in the error message
         if path is not None:
             hint_path = f' in {path}'
 
@@ -125,33 +160,33 @@ class ResultSubmission:
 
         if df.empty:
             raise ValueError(f'error{hint_path}: results file is empty')
-        elif len(df) == ResultSubmission.DEV_LEN:
+        elif len(df) == constants.DEV_SAMPLES:
             inferred_type = 'dev'
-            expected_len = ResultSubmission.DEV_LEN
-        elif len(df) == ResultSubmission.TEST_LEN:
+            expected_len = constants.DEV_SAMPLES
+        elif len(df) == constants.TEST_SAMPLES:
             inferred_type = 'test'
-            expected_len = ResultSubmission.TEST_LEN
+            expected_len = constants.TEST_SAMPLES
         else:
             raise ValueError(f'wrong number of prevalence values found{hint_path}; '
-                             f'expected {ResultSubmission.DEV_LEN} for development sets and '
-                             f'{ResultSubmission.TEST_LEN} for test sets; found {len(df)}')
+                             f'expected {constants.DEV_SAMPLES} for development sets and '
+                             f'{constants.TEST_SAMPLES} for test sets; found {len(df)}')
 
         set_names = frozenset(df.filename)
         for i in range(expected_len):
             if f'{inferred_type}_sample_{i}.txt' not in set_names:
-                raise ValueError(f'{hint_path} a file with {len(df)} entries is assumed to be of type '
+                raise ValueError(f'error{hint_path}: a file with {len(df)} entries is assumed to be of type '
                                  f'"{inferred_type}" but entry {inferred_type}_sample_{i}.txt is missing '
                                  f'(among perhaps many others)')
 
         for category_name in df.columns[1:]:
             if (df[category_name] < 0).any() or (df[category_name] > 1).any():
-                raise ValueError(f'{hint_path} column "{category_name}" contains values out of range [0,1]')
+                raise ValueError(f'error{hint_path}: column "{category_name}" contains values out of range [0,1]')
 
         prevs = df.loc[:, df.columns[1]:].values
-        round_errors = np.abs(prevs.sum(axis=-1) - 1.) > ResultSubmission.ERROR_TOL
+        round_errors = np.abs(prevs.sum(axis=-1) - 1.) > constants.ERROR_TOL
         if round_errors.any():
             raise ValueError(f'warning: prevalence values in rows with id {np.where(round_errors)[0].tolist()} '
-                             f'do not sum up to 1 (error tolerance {ResultSubmission.ERROR_TOL}), '
+                             f'do not sum up to 1 (error tolerance {constants.ERROR_TOL}), '
                              f'probably due to some rounding errors.')
 
         if return_inferred_type:
@@ -163,20 +198,31 @@ class ResultSubmission:
         self.df = self.df.reindex([self.df.columns[0]] + sorted(self.df.columns[1:]), axis=1)
         self.categories = sorted(self.categories)
 
+    def filenames(self):
+        return self.df.filename.values
+
 
-def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=1000, average=True):
+def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=None, average=True):
+    if sample_size is None:
+        if qp.environ['SAMPLE_SIZE'] is None:
+            raise ValueError('Relative Absolute Error cannot be computed: '
+                             'neither sample_size nor qp.environ["SAMPLE_SIZE"] has been specified')
+        else:
+            sample_size = qp.environ['SAMPLE_SIZE']
+
     if len(true_prevs) != len(predicted_prevs):
-        raise ValueError(f'size mismatch, groun truth has {len(true_prevs)} entries '
-                         f'while predictions contain {len(predicted_prevs)} entries')
+        raise ValueError(f'size mismatch, the ground truth file has {len(true_prevs)} entries '
+                         f'while the file of predictions contains {len(predicted_prevs)} entries')
     true_prevs.sort_categories()
     predicted_prevs.sort_categories()
     if true_prevs.categories != predicted_prevs.categories:
-        raise ValueError(f'these result files are not comparable since the categories are different')
+        raise ValueError(f'these result files are not comparable since the categories are different: '
+                         f'true={true_prevs.categories} vs. predictions={predicted_prevs.categories}')
 
     ae, rae = [], []
-    for sample_name in true_prevs.df.filename.values:
-        ae.append(qp.error.mae(true_prevs.get(sample_name), predicted_prevs.get(sample_name)))
-        rae.append(qp.error.mrae(true_prevs.get(sample_name), predicted_prevs.get(sample_name), eps=sample_size))
+    for sample_name, true_prevalence in true_prevs.iterrows():
+        pred_prevalence = predicted_prevs.prevalence(sample_name)
+        ae.append(qp.error.ae(true_prevalence, pred_prevalence))
+        rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size)))
     ae = np.asarray(ae)
     rae = np.asarray(rae)
     if average:
@@ -187,21 +233,6 @@ def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=None, average=True):
 
 
 
-# r = ResultSubmission(['negative', 'positive'])
-# from tqdm import tqdm
-# for i in tqdm(range(1000), total=1000):
-#     r.add(f'dev_sample_{i}.txt', np.asarray([0.5, 0.5]))
-# r.dump('./path.csv')
 
-# r = ResultSubmission.load('./data/T1A/public/dummy_submission.csv')
-# t = ResultSubmission.load('./data/T1A/public/dummy_submission (copy).csv')
-# print(r.df)
-# print(r.get('dev_sample_10.txt'))
-# print(evaluate_submission(r, t))
 
-# s = ResultSubmission.load('./data/T1A/public/dummy_submission.csv')
-#
-# print(s)
 
diff --git a/LeQua2022/evaluation.py b/LeQua2022/evaluation.py
new file mode 100644
index 0000000..e56d6d5
--- /dev/null
+++ b/LeQua2022/evaluation.py
@@ -0,0 +1,43 @@
+import argparse
+import quapy as qp
+from data import ResultSubmission, evaluate_submission
+import constants
+import os
+
+"""
+LeQua2022 Official evaluation script
+"""
+
+def main(args):
+    if args.task == 'T1A':
+        qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE
+    elif args.task == 'T1B':
+        qp.environ['SAMPLE_SIZE'] = constants.T1B_SAMPLE_SIZE
+
+    true_prev = ResultSubmission.load(args.true_prevalences)
+    pred_prev = ResultSubmission.load(args.pred_prevalences)
+    mae, mrae = evaluate_submission(true_prev, pred_prev)
+    print(f'MAE: {mae:.4f}')
+    print(f'MRAE: {mrae:.4f}')
+
+    if args.output is not None:
+        outdir = os.path.dirname(args.output)
+        if outdir:
+            os.makedirs(outdir, exist_ok=True)
+        with open(args.output, 'wt') as foo:
+            foo.write(f'MAE: {mae:.4f}\n')
+            foo.write(f'MRAE: {mrae:.4f}\n')
+
+
+if __name__=='__main__':
+    parser = argparse.ArgumentParser(description='LeQua2022 official evaluation script')
+    parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'],
+                        help='Task name (T1A, T1B, T2A, T2B)')
+    parser.add_argument('true_prevalences', metavar='TRUE-PREV-PATH', type=str,
+                        help='Path of the ground truth prevalence values file (.csv)')
+    parser.add_argument('pred_prevalences', metavar='PRED-PREV-PATH', type=str,
+                        help='Path of the predicted prevalence values file (.csv)')
+    parser.add_argument('--output', metavar='SCORES-PATH', type=str, default=None,
+                        help='Path where to store the evaluation scores')
+    args = parser.parse_args()
+
+    main(args)
diff --git a/LeQua2022/format_checker.py b/LeQua2022/format_checker.py
new file mode 100644
index 0000000..25f3e45
--- /dev/null
+++ b/LeQua2022/format_checker.py
@@ -0,0 +1,27 @@
+import argparse
+import quapy as qp
+from data import ResultSubmission, evaluate_submission
+import constants
+import os
+
+"""
+LeQua2022 Official format-checker script
+"""
+
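+# ResultSubmission.check_file_format parses the csv and raises a ValueError that
+# describes the first violation found (wrong number of rows, a missing
+# {dev,test}_sample_<i>.txt entry, prevalence values out of [0,1] or not summing
+# to 1); main() below simply reports the outcome
+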
+def main(args):
+    try:
+        ResultSubmission.check_file_format(args.prevalence_file)
+    except Exception as e:
+        print(e)
+        print('Format check: not passed')
+    else:
+        print('Format check: passed')
+
+
+if __name__=='__main__':
+    parser = argparse.ArgumentParser(description='LeQua2022 official format-checker script')
+    parser.add_argument('prevalence_file', metavar='PREV-PATH', type=str,
+                        help='Path of the file containing the prevalence values to check')
+    args = parser.parse_args()
+
+    main(args)
diff --git a/LeQua2022/main_binary_vector.py b/LeQua2022/main_binary_vector.py
deleted file mode 100644
index 2930091..0000000
--- a/LeQua2022/main_binary_vector.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import pickle
-
-import numpy as np
-from sklearn.linear_model import LogisticRegression
-from tqdm import tqdm
-import pandas as pd
-
-import quapy as qp
-from quapy.data import LabelledCollection
-from quapy.method.aggregative import *
-import quapy.functional as F
-from data import load_binary_vectors
-import os
-
-path_binary_vector = './data/T1A'
-result_path = os.path.join('results', 'T1A')  # binary - vector
-os.makedirs(result_path, exist_ok=True)
-
-train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt')
-
-train = LabelledCollection.load(train_file, load_binary_vectors)
-
-nF = train.instances.shape[1]
-
-print(f'number of classes: {len(train.classes_)}')
-print(f'number of training documents: {len(train)}')
-print(f'training prevalence: {F.strprev(train.prevalence())}')
-print(f'training matrix shape: {train.instances.shape}')
-
-dev_prev = pd.read_csv(os.path.join(path_binary_vector, 'public', 'dev_prevalences.csv'), index_col=0)
-print(dev_prev)
-
-
-scores = {}
-for quantifier in [CC]:  # , ACC, PCC, PACC, EMQ, HDy]:
-
-    classifier = CalibratedClassifierCV(LogisticRegression())
-    model = quantifier(classifier).fit(train)
-    quantifier_name = model.__class__.__name__
-
-    scores[quantifier_name]={}
-    for sample_set, sample_size in [('dev', 1000)]:
-        ae_errors, rae_errors = [], []
-        for i, row in tqdm(dev_prev.iterrows(), total=len(dev_prev), desc=f'testing {quantifier_name} in {sample_set}'):
-            filename = row['filename']
-            prev_true = row[1:].values
-            sample_path = os.path.join(path_binary_vector, 'public', f'{sample_set}_vectors', filename)
-            sample, _ = load_binary_vectors(sample_path, nF)
-            qp.environ['SAMPLE_SIZE'] = sample.shape[0]
-            prev_estim = model.quantify(sample)
-            # prev_true = sample.prevalence()
-            ae_errors.append(qp.error.mae(prev_true, prev_estim))
-            rae_errors.append(qp.error.mrae(prev_true, prev_estim))
-
-        ae_errors = np.asarray(ae_errors)
-        rae_errors = np.asarray(rae_errors)
-
-        mae = ae_errors.mean()
-        mrae = rae_errors.mean()
-        scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}
-        pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
-        pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
-        print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
-        print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')
-
-for model in scores:
-    for sample_set in ['validation']:  # , 'test']:
-        print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}')
-
-
-"""
-test:
-CC 0.1859 1.5406
-ACC 0.0453 0.2840
-PCC 0.1793 1.7187
-PACC 0.0287 0.1494
-EMQ 0.0225 0.1020
-HDy 0.0631 0.2307
-
-validation
-CC 0.1862 1.9587
-ACC 0.0394 0.2669
-PCC 0.1789 2.1383
-PACC 0.0354 0.1587
-EMQ 0.0224 0.0960
-HDy 0.0467 0.2121
-"""
-
-
diff --git a/LeQua2022/predict.py b/LeQua2022/predict.py
new file mode 100644
index 0000000..c6285cb
--- /dev/null
+++ b/LeQua2022/predict.py
@@ -0,0 +1,61 @@
+import argparse
+import os
+import pickle
+from glob import glob
+
+from tqdm import tqdm
+
+import constants
+from data import ResultSubmission, gen_load_samples_T1, load_category_map
+
+"""
+LeQua2022 prediction script
+"""
+
+def main(args):
+
+    # check the number of samples
+    nsamples = len(glob(os.path.join(args.samples, '*.txt')))
+    if nsamples not in {constants.DEV_SAMPLES, constants.TEST_SAMPLES}:
+        print(f'Warning: the number of samples neither coincides with the expected number of '
+              f'dev samples ({constants.DEV_SAMPLES}) nor with the expected number of '
+              f'test samples ({constants.TEST_SAMPLES}).')
+
+    _, categories = load_category_map(args.catmap)
+
+    # load the pickled model
+    model = pickle.load(open(args.model, 'rb'))
+
+    # predictions
+    predictions = ResultSubmission(categories=categories)
+    for samplename, sample in tqdm(gen_load_samples_T1(args.samples, args.nf),
+                                   desc='predicting', total=nsamples):
+        predictions.add(samplename, model.quantify(sample))
+
+    # saving
+    outdir = os.path.dirname(args.output)
+    if outdir:
+        os.makedirs(outdir, exist_ok=True)
+    predictions.dump(args.output)
+
+
+if __name__=='__main__':
+    parser = argparse.ArgumentParser(description='LeQua2022 prediction script')
+    parser.add_argument('model', metavar='MODEL-PATH', type=str,
+                        help='Path of the saved model')
+    parser.add_argument('samples', metavar='SAMPLES-PATH', type=str,
+                        help='Path to the directory containing the samples')
+    parser.add_argument('output', metavar='PREDICTIONS-PATH', type=str,
+                        help='Path where to store the predictions file')
+    parser.add_argument('catmap', metavar='CATEGORY-MAP-PATH', type=str,
+                        help='Path to the category map file')
+    parser.add_argument('nf', metavar='NUM-FEATURES', type=int,
+                        help='Number of features seen during training')
+    args = parser.parse_args()
+
+    if not os.path.exists(args.samples):
+        raise FileNotFoundError(f'path {args.samples} does not exist')
+    if not os.path.isdir(args.samples):
+        raise ValueError(f'path {args.samples} is not a valid directory')
+
+    main(args)
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
index ebdb537..42ecf01 100644
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@@ -9,6 +9,7 @@ from quapy.method.base import BaseQuantifier
 from quapy.util import temp_seed
 import quapy.functional as F
 import pandas as pd
+import inspect
 
 
 def artificial_prevalence_prediction(
@@ -78,6 +79,27 @@ def natural_prevalence_prediction(
     return _predict_from_indexes(indexes, model, test, n_jobs, verbose)
 
 
+def gen_prevalence_prediction(model: BaseQuantifier, gen_fn: Callable, eval_budget=None):
+    if not inspect.isgenerator(gen_fn()):
+        raise ValueError('param "gen_fn" does not return a generator')
+
+    if not isinstance(eval_budget, int):
+        eval_budget = -1
+
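+    # a non-integer budget (e.g., None) becomes -1, so the countdown below never
+    # reaches exactly 0 and the generator is consumed in full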
+    true_prevalences, estim_prevalences = [], []
+    for sample_instances, true_prev in gen_fn():
+        true_prevalences.append(true_prev)
+        estim_prevalences.append(model.quantify(sample_instances))
+        eval_budget -= 1
+        if eval_budget == 0:
+            break
+
+    true_prevalences = np.asarray(true_prevalences)
+    estim_prevalences = np.asarray(estim_prevalences)
+
+    return true_prevalences, estim_prevalences
+
+
 def _predict_from_indexes(
     indexes,
     model: BaseQuantifier,
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 1080db0..95c6ff8 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -5,8 +5,9 @@
 from typing import Union, Callable
 import quapy as qp
 from quapy.data.base import LabelledCollection
-from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction
+from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction, gen_prevalence_prediction
 from quapy.method.aggregative import BaseQuantifier
+import inspect
 
 
 class GridSearchQ(BaseQuantifier):
@@ -74,8 +75,10 @@ class GridSearchQ(BaseQuantifier):
         self.timeout = timeout
         self.verbose = verbose
         self.__check_error(error)
-        assert self.protocol in {'app', 'npp'}, \
-            'unknown protocol; valid ones are "app" or "npp" for the "artificial" or the "natural" prevalence protocols'
+        assert self.protocol in {'app', 'npp', 'gen'}, \
+            'unknown protocol: valid ones are "app" or "npp" for the "artificial" or the "natural" prevalence ' \
+            'protocols. Use protocol="gen" when passing a generator function through val_split that yields a ' \
+            'sample (instances) and their prevalence (ndarray) at each iteration.'
         if self.protocol == 'npp':
             if self.n_repetitions is None or self.n_repetitions == 1:
                 if self.eval_budget is not None:
@@ -99,9 +102,14 @@ class GridSearchQ(BaseQuantifier):
             assert 0. < validation < 1., 'validation proportion should be in (0,1)'
             training, validation = training.split_stratified(train_prop=1 - validation)
             return training, validation
+        elif self.protocol == 'gen' and inspect.isgenerator(validation()):
+            return training, validation
         else:
             raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
-                             f'proportion of training documents to extract (type found: {type(validation)})')
+                             f' proportion of training documents to extract (type found: {type(validation)}). '
+                             f'Optionally, "validation" can be a callable function returning a generator that yields '
+                             f'the sample instances along with their true prevalence at each iteration by '
+                             f'setting protocol="gen".')
 
     def __check_error(self, error):
         if error in qp.error.QUANTIFICATION_ERROR:
@@ -132,6 +140,8 @@ class GridSearchQ(BaseQuantifier):
             return natural_prevalence_prediction(
                 model, val_split, self.sample_size, **commons)
+        elif self.protocol == 'gen':
+            return gen_prevalence_prediction(model, gen_fn=val_split, eval_budget=self.eval_budget)
         else:
             raise ValueError('unknown protocol')
@@ -144,7 +154,8 @@ class GridSearchQ(BaseQuantifier):
         if val_split is None:
             val_split = self.val_split
         training, val_split = self.__check_training_validation(training, val_split)
-        assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
+        if self.protocol != 'gen':
+            assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
 
         params_keys = list(self.param_grid.keys())
         params_values = list(self.param_grid.values())
@@ -192,8 +203,6 @@ class GridSearchQ(BaseQuantifier):
                 raise TimeoutError('all jobs took more than the timeout time to end')
 
         self.sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
-        # model.set_params(**self.best_params_)
-        # self.best_model_ = deepcopy(model)
 
         if self.refit:
             self.sout(f'refitting on the whole development set')
@@ -203,11 +212,11 @@ class GridSearchQ(BaseQuantifier):
 
     def quantify(self, instances):
         assert hasattr(self, 'best_model_'), 'quantify called before fit'
-        return self.best_model_.quantify(instances)
+        return self.best_model().quantify(instances)
 
     @property
     def classes_(self):
-        return self.best_model_.classes_
+        return self.best_model().classes_
 
     def set_params(self, **parameters):
         self.param_grid = parameters
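
A minimal usage sketch of the protocol="gen" mode introduced above (train, gen_samples and sample_instances are placeholders: train is a LabelledCollection and gen_samples a callable returning a fresh generator of (sample, prevalence) pairs, as in baselines_T1Amodsel.py):

    import quapy as qp
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import PACC

    search = qp.model_selection.GridSearchQ(
        PACC(LogisticRegression()),
        param_grid={'C': [0.1, 1, 10]},  # forwarded to the underlying classifier
        sample_size=None,                # not required when protocol='gen'
        protocol='gen',                  # evaluate each combination on the yielded samples
        error=qp.error.mae,
        refit=False,
    ).fit(train, gen_samples)

    quantifier = search.best_model()
    estimated_prevalence = quantifier.quantify(sample_instances)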