From 8368c467dc4d7aee961fcf8b52339e9f81aa5499 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Fri, 26 Nov 2021 10:57:49 +0100 Subject: [PATCH] adapting new format --- LeQua2022/baselines_T1.py | 24 ++++++----- LeQua2022/baselines_T2.py | 89 ++++++++++++++++++++++++++++++++++++--- LeQua2022/data.py | 37 ++++++++++------ quapy/evaluation.py | 1 + quapy/model_selection.py | 3 ++ 5 files changed, 125 insertions(+), 29 deletions(-) diff --git a/LeQua2022/baselines_T1.py b/LeQua2022/baselines_T1.py index 621c930..dd548c2 100644 --- a/LeQua2022/baselines_T1.py +++ b/LeQua2022/baselines_T1.py @@ -14,10 +14,10 @@ import constants def baselines(): yield CC(LR(n_jobs=-1)), "CC" - yield ACC(LR(n_jobs=-1)), "ACC" - yield PCC(LR(n_jobs=-1)), "PCC" - yield PACC(LR(n_jobs=-1)), "PACC" - yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD" + # yield ACC(LR(n_jobs=-1)), "ACC" + # yield PCC(LR(n_jobs=-1)), "PCC" + # yield PACC(LR(n_jobs=-1)), "PACC" + # yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD" # yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy" # yield MLPE(), "MLPE" @@ -28,7 +28,7 @@ def main(args): path_dev_vectors = os.path.join(args.datadir, 'dev_vectors') path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv') - path_train = os.path.join(args.datadir, 'training_vectors.txt') + path_train = os.path.join(args.datadir, 'training_vectors.csv') qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task] @@ -46,13 +46,15 @@ def main(args): # } param_grid = { - 'C': [1], + 'C': [0.01], 'class_weight': ['balanced'] } + target_metric = qp.error.mrae def gen_samples(): return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False, - load_fn=load_vector_documents, nF=nF) + load_fn=load_vector_documents, ext='csv') + for quantifier, q_name in baselines(): print(f'{q_name}: Model selection') @@ -61,12 +63,12 @@ def main(args): param_grid, sample_size=None, protocol='gen', - error=qp.error.mae, + error=target_metric, #qp.error.mae, refit=False, verbose=True ).fit(train, gen_samples) - print(f'{q_name} got MAE={quantifier.best_score_:.3f} (hyper-params: {quantifier.best_params_})') + print(f'{q_name} got MRAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})') model_path = os.path.join(models_path, q_name+'.pkl') print(f'saving model in {model_path}') @@ -91,8 +93,8 @@ if __name__ == '__main__': raise ValueError(f'path {args.datadir} is not a valid directory') if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.csv")): raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.csv" file') - if not os.path.exists(os.path.join(args.datadir, "training_vectors.txt")): - raise FileNotFoundError(f'path {args.datadir} does not contain "training_vectors.txt" file') + if not os.path.exists(os.path.join(args.datadir, "training_vectors.csv")): + raise FileNotFoundError(f'path {args.datadir} does not contain "training_vectors.csv" file') if not os.path.exists(os.path.join(args.datadir, "dev_vectors")): raise FileNotFoundError(f'path {args.datadir} does not contain "dev_vectors" folder') diff --git a/LeQua2022/baselines_T2.py b/LeQua2022/baselines_T2.py index ca70b03..e119aa4 100644 --- a/LeQua2022/baselines_T2.py +++ b/LeQua2022/baselines_T2.py @@ -1,8 +1,14 @@ import argparse import pickle +from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression as LR +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +from LeQua2022.pretrained_embeddings import TfidfWordEmbeddingTransformer, WordEmbeddingAverageTransformer +from LeQua2022.word_class_embeddings import WordClassEmbeddingsTransformer, ConcatenateEmbeddingsTransformer from quapy.method.aggregative import * from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE import quapy.functional as F @@ -20,7 +26,7 @@ def baselines(): yield PCC(LR(n_jobs=-1)), "PCC" yield PACC(LR(n_jobs=-1)), "PACC" yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD" - yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy" + # yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy" # yield MLPE(), "MLPE" @@ -35,9 +41,69 @@ def main(args): qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task] train = LabelledCollection.load(path_train, load_raw_documents) - tfidf = TfidfVectorizer(lowercase=True, stop_words='english', min_df=4) # TfidfVectorizer(min_df=5) - train.instances = tfidf.fit_transform(train.instances) - nF = train.instances.shape[1] + + if args.mode == 'tfidf1': + tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True) + if args.mode == 'tfidf2': + tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2)) + if args.mode == 'tfidf3': + tfidf = Pipeline([ + ('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True)), + ('svd', TruncatedSVD(n_components=300)) + ]) + if args.mode == 'tfidf4': + tfidf = Pipeline([ + ('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))), + ('svd', TruncatedSVD(n_components=300)) + ]) + if args.mode == 'glove1': + tfidf = Pipeline([ + ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')), + ('zscore', StandardScaler()) + ]) + if args.mode == 'glove2': + tfidf = WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe') + if args.mode == 'glove3': + vect = TfidfVectorizer(min_df=5, sublinear_tf=True) + tfidf = Pipeline([ + ('tfidf', vect), + ('embedding', TfidfWordEmbeddingTransformer( + wordset_name='glove', + features_call=vect.get_feature_names_out, + path='/mnt/1T/Datasets/GloVe')), + ('zscore', StandardScaler()) + ]) + if args.mode == 'glove4': + vect = TfidfVectorizer(min_df=5, sublinear_tf=True) + tfidf = Pipeline([ + ('tfidf', vect), + ('embedding', TfidfWordEmbeddingTransformer( + wordset_name='glove', + features_call=vect.get_feature_names_out, + path='/mnt/1T/Datasets/GloVe')) + ]) + if args.mode == 'wce1': + tfidf = WordClassEmbeddingsTransformer() + if args.mode == 'wce2': + glove = Pipeline([ + ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')), + ('zscore', StandardScaler()) + ]) + wce = WordClassEmbeddingsTransformer() + tfidf = ConcatenateEmbeddingsTransformer([glove, wce]) + if args.mode == 'wce3': + glove = Pipeline([ + ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')), + ('zscore', StandardScaler()) + ]) + wce = WordClassEmbeddingsTransformer() + tfidf = Pipeline([ + ('glove-wce', ConcatenateEmbeddingsTransformer([glove, wce])), + ('svd', TruncatedSVD(n_components=300)) + ]) + target_metric = qp.error.mrae + + train.instances = tfidf.fit_transform(*train.Xy) print(f'number of classes: {len(train.classes_)}') print(f'number of training documents: {len(train)}') @@ -58,6 +124,7 @@ def main(args): return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False, load_fn=load_raw_unlabelled_documents, vectorizer=tfidf) + outs = [] for quantifier, q_name in baselines(): print(f'{q_name}: Model selection') quantifier = qp.model_selection.GridSearchQ( @@ -65,17 +132,25 @@ def main(args): param_grid, sample_size=None, protocol='gen', - error=qp.error.mae, + error=target_metric, #qp.error.mae, refit=False, verbose=True ).fit(train, gen_samples) - print(f'{q_name} got MAE={quantifier.best_score_:.3f} (hyper-params: {quantifier.best_params_})') + print(f'{q_name} got MAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})') + outs.append(f'{q_name} got MAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})') model_path = os.path.join(models_path, q_name+'.'+args.task+'.pkl') print(f'saving model in {model_path}') pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) + print(tfidf) + print(args.mode) + print(outs) + with open(f'{args.mode}.{args.task}.txt', 'wt') as foo: + for line in outs: + foo.write(f'{line}\n') + if __name__ == '__main__': parser = argparse.ArgumentParser(description='LeQua2022 Task T2A/T2B baselines') @@ -87,6 +162,8 @@ if __name__ == '__main__': parser.add_argument('modeldir', metavar='MODEL-PATH', type=str, help='Path where to save the models. ' 'A subdirectory named will be automatically created.') + parser.add_argument('mode', metavar='PREPROCESSMODE', type=str, + help='modality of preprocessing') args = parser.parse_args() if not os.path.exists(args.datadir): diff --git a/LeQua2022/data.py b/LeQua2022/data.py index 01dd31b..e581096 100644 --- a/LeQua2022/data.py +++ b/LeQua2022/data.py @@ -34,33 +34,42 @@ def load_raw_unlabelled_documents(path, vectorizer=None): return documents, None -def load_vector_documents(path, nF=None): - X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF) - y = y.astype(int) +# def load_vector_documents(path, nF=None): +# X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF, zero_based=True) +# y = y.astype(int) +# return X, y + +def load_vector_documents(path): + D = pd.read_csv(path).to_numpy(dtype=np.float) + labelled = D.shape[1] == 301 + if labelled: + X, y = D[:,:300], D[:,-1].astype(np.int).flatten() + else: + X, y = D, None return X, y -def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, load_fn, **load_kwargs): +def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, ext:str, load_fn, **load_kwargs): true_prevs = ResultSubmission.load(ground_truth_path) for id, prevalence in true_prevs.iterrows(): - sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs) + sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs) yield (id, sample, prevalence) if return_id else (sample, prevalence) -def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn, **load_kwargs): - nsamples = len(glob(os.path.join(path_dir, '*.txt'))) +def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, ext:str, load_fn, **load_kwargs): + nsamples = len(glob(os.path.join(path_dir, f'*.{ext}'))) for id in range(nsamples): - sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs) + sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs) yield (id, sample) if return_id else sample -def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=True, load_fn=load_vector_documents, **load_kwargs): +def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=True, ext='txt', load_fn=load_vector_documents, **load_kwargs): if ground_truth_path is None: # the generator function returns tuples (docid:str, sample:csr_matrix or str) - gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, **load_kwargs) + gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, ext, load_fn, **load_kwargs) else: # the generator function returns tuples (docid:str, sample:csr_matrix or str, prevalence:ndarray) - gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, **load_kwargs) + gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, ext, load_fn, **load_kwargs) for r in gen_fn: yield r @@ -139,7 +148,11 @@ class ResultSubmission: @classmethod def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]: - df = pd.read_csv(path, index_col=0) + try: + df = pd.read_csv(path, index_col=0) + except Exception as e: + print(f'the file {path} does not seem to be a valid csv file. ') + print(e) return ResultSubmission.check_dataframe_format(df, path=path) @classmethod diff --git a/quapy/evaluation.py b/quapy/evaluation.py index ff0b356..5b75e21 100644 --- a/quapy/evaluation.py +++ b/quapy/evaluation.py @@ -24,6 +24,7 @@ def artificial_prevalence_prediction( verbose=False): """ Performs the predictions for all samples generated according to the artificial sampling protocol. + :param model: the model in charge of generating the class prevalence estimations :param test: the test set on which to perform arificial sampling :param sample_size: the size of the samples diff --git a/quapy/model_selection.py b/quapy/model_selection.py index 11a9cab..5af4b2f 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -3,6 +3,8 @@ import signal from copy import deepcopy from typing import Union, Callable +import numpy as np + import quapy as qp from quapy.data.base import LabelledCollection from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction, gen_prevalence_prediction @@ -190,6 +192,7 @@ class GridSearchQ(BaseQuantifier): model.fit(training) true_prevalences, estim_prevalences = self.__generate_predictions(model, val_split) score = self.error(true_prevalences, estim_prevalences) + self._sout(f'checking hyperparams={params} got {self.error.__name__} score {score:.5f}') if self.best_score_ is None or score < self.best_score_: self.best_score_ = score