From dd34a96f87c77f9335fd649c3554859ed5c74736 Mon Sep 17 00:00:00 2001 From: andrea Date: Thu, 12 Dec 2019 11:18:38 +0100 Subject: [PATCH 1/6] also saving n_components if auto optimizing it removed some unnecessary columns from result csv --- src/FPEC_andrea.py | 48 ++++++-- src/data/embeddings.py | 30 ++--- src/learning/learners.py | 260 ++++++++++++++++++++------------------- src/util/results.py | 7 +- 4 files changed, 189 insertions(+), 156 deletions(-) diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 1618c33..16934df 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -35,16 +35,22 @@ parser.add_option("-c", "--optimc", dest="optimc", action='store_true', parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, help="Number of parallel jobs (default is -1, all)", default=-1) -parser.add_option("-p", "--pca", dest="max_labels", type=int, - help="If less than number of target classes, will apply PCA to supervised matrix. If set to 0 it" - " will automatically search for the best number of components", default=300) +parser.add_option("-p", "--pca", dest="max_labels_S", type=int, + help="If smaller than number of target classes, PCA will be applied to supervised matrix. " + "If set to 0 it will automatically search for the best number of components. " + "If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)", + default=300) parser.add_option("-u", "--upca", dest="max_labels_U", type=int, - help="If smaller than Unsupervised Dimension, will apply PCA to unsupervised matrix. If set to 0 it" - " will automatically search for the best number of components", default=300) + help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." + " If set to 0 it will automatically search for the best number of components", default=300) parser.add_option("-l", dest="lang", type=str) +parser.add_option("-a", dest="post_pca", + help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with " + "embedding space", default=False) + def get_learner(calibrate=False, kernel='linear'): return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto') @@ -73,7 +79,7 @@ if __name__ == '__main__': data = MultilingualDataset.load(op.dataset) data.show_dimensions() - data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10))) + # data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10))) # data.set_view(languages=[op.lang]) # data.set_view(categories=list(range(10))) lXtr, lytr = data.training() @@ -114,12 +120,34 @@ if __name__ == '__main__': ##### TODO - config dict is redundant - we have already op argparse ... 
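The three settings of the -p/--pca option described above (a fixed dimensionality, 0 for an automatic search, -1 for a single PCA fit on the vertically stacked supervised matrices) can be summarised with the following sketch. It calls scikit-learn's PCA directly; the helper name reduce_supervised, the 0.99 cumulative-variance threshold for the automatic case, and the stacked_dim default are illustrative assumptions, not code from this repository:

    import numpy as np
    from sklearn.decomposition import PCA

    def reduce_supervised(lang_S, max_label_space, stacked_dim=50):
        # lang_S: {language: (n_words, n_classes) supervised (WCE) matrix}
        n_classes = next(iter(lang_S.values())).shape[1]
        if max_label_space == 0:
            # automatic search: keep enough components to explain ~99% of the variance
            probe = PCA().fit(np.vstack(list(lang_S.values())))
            cumulative = np.cumsum(probe.explained_variance_ratio_)
            best_n = int(np.argmax(cumulative >= 0.99)) + 1
            return {l: PCA(n_components=best_n).fit_transform(S) for l, S in lang_S.items()}
        if max_label_space == -1:
            # one PCA fit on the vertically stacked matrices, shared by all languages
            pca = PCA(n_components=stacked_dim).fit(np.vstack(list(lang_S.values())))
            return {l: pca.transform(S) for l, S in lang_S.items()}
        if max_label_space < n_classes:
            # per-language reduction to the requested number of components
            return {l: PCA(n_components=max_label_space).fit_transform(S) for l, S in lang_S.items()}
        return lang_S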
config['reduction'] = 'PCA' - config['max_label_space'] = op.max_labels + config['max_label_space'] = op.max_labels_S config['dim_reduction_unsupervised'] = op.max_labels_U + config['post_pca'] = op.post_pca # config['plot_covariance_matrices'] = True result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') + PLE_test = False + if PLE_test: + ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/moreo/CLESA/PolylingualEmbeddings', + learner=get_learner(calibrate=False), + c_parameters=get_params(dense=False), + n_jobs=op.n_jobs) + + print('# Fitting ...') + ple.fit(lXtr, lytr) + + print('# Evaluating ...') + ple_eval = evaluate_method(ple, lXte, lyte) + + metrics = [] + for lang in lXte.keys(): + macrof1, microf1, macrok, microk = ple_eval[lang] + metrics.append([macrof1, microf1, macrok, microk]) + print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) + print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) + + print(f'### PolyEmbedd_andrea_{_config_id}\n') classifier = AndreaCLF(we_path=op.we_path, config=config, @@ -140,6 +168,8 @@ if __name__ == '__main__': macrof1, microf1, macrok, microk = l_eval[lang] metrics.append([macrof1, microf1, macrok, microk]) print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) - results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1], - classifier.time, lang, macrof1, microf1, macrok, microk, '') + results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], + (config['max_label_space'], classifier.best_components), + config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time, + lang, macrof1, microf1, macrok, microk, '') print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 66e830f..fb1f135 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -48,7 +48,7 @@ class WordEmbeddings: print('loading pkl in {}'.format(we_path + '.pkl')) (worddim, we) = pickle.load(open(we_path + '.pkl', 'rb')) else: - word_registry=set() + word_registry = set() lines = open(we_path).readlines() nwords, dims = [int(x) for x in lines[0].split()] print('reading we of {} dimensions'.format(dims)) @@ -61,13 +61,13 @@ class WordEmbeddings: word, *vals = line.split() wordp = word_preprocessor(word) if word_preprocessor is not None else word if wordp: - wordp=wordp[0] + wordp = wordp[0] if wordp in word_registry: print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word,wordp)) elif len(vals) == dims: worddim[wordp] = index we[index, :] = np.array(vals).astype(float) - index+=1 + index += 1 # else: # print('warning: word <{}> generates an empty string after preprocessing'.format(word)) we = we[:index] @@ -151,7 +151,6 @@ class FastTextWikiNews(Vectors): def __init__(self, cache, language="en", **kwargs): url = self.url_base.format(language) - # name = self.path.format(language) name = cache + self._name.format(language) super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) @@ -211,16 +210,11 @@ class StorageEmbeddings: def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300): for lang in docs.keys(): - nC = self.lang_U[lang].shape[1] print(f'# [unsupervised-matrix {type}] for {lang}') voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0]) self.lang_U[lang] = 
EmbeddingsAligned(type, self.path, lang, voc).vectors - # if self.lang_U[lang].shape[1] > dim != 0: - # print(f'unsupervised matrix has more dimensions ({self.lang_U[lang].shape[1]}) than' - # f' the allowed limit {dim}. Applying PCA(n_components={dim})') - # pca = PCA(n_components=dim) - # self.lang_U[lang] = pca.fit_transform(self.lang_U[lang]) print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n') + nC = self.lang_U[lang].shape[1] if max_label_space == 0: print(f'Computing optimal number of PCA components along matrices U') optimal_n = get_optimal_dim(self.lang_U, 'U') @@ -228,22 +222,28 @@ class StorageEmbeddings: elif max_label_space < nC: self.lang_U = run_pca(max_label_space, self.lang_U) + return def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc): - # if max_label_space == 0: - # print('Computing optimal number of PCA components along matrices S...') - # optimal_n = self.get_optimal_supervised_components(docs, labels) - # max_label_space = optimal_n for lang in docs.keys(): # compute supervised matrices S - then apply PCA - nC = self.lang_S[lang].shape[1] print(f'# [supervised-matrix] for {lang}') self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang) + nC = self.lang_S[lang].shape[1] print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n') if max_label_space == 0: + print(f'Computing optimal number of PCA components along matrices S') optimal_n = get_optimal_dim(self.lang_S, 'S') self.lang_S = run_pca(optimal_n, self.lang_S) + elif max_label_space == -1: + print(f'Computing PCA on vertical stacked WCE embeddings') + languages = self.lang_S.keys() + _temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) + stacked_pca = PCA(n_components=50) + stacked_pca.fit(_temp_stack) + for lang in languages: + self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang]) elif max_label_space < nC: self.lang_S = run_pca(max_label_space, self.lang_S) diff --git a/src/learning/learners.py b/src/learning/learners.py index 96e200c..1d119e3 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -8,7 +8,7 @@ from sklearn.model_selection import KFold from joblib import Parallel, delayed from sklearn.feature_extraction.text import TfidfVectorizer from transformers.StandardizeTransformer import StandardizeTransformer -# from sklearn.decomposition import PCA +from sklearn.decomposition import PCA def _sort_if_sparse(X): @@ -214,11 +214,6 @@ class NaivePolylingualClassifier: models = Parallel(n_jobs=self.n_jobs)\ (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs) - # - # models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs] - # - # for model, lang in zip(models, langs): - # model.fit(lX[lang], ly[lang]) self.model = {lang: models[i] for i, lang in enumerate(langs)} self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs} @@ -329,6 +324,131 @@ class MonolingualClassifier: return self.best_params_ +class AndreaCLF(FunnellingPolylingualClassifier): + def __init__(self, + we_path, + config, + first_tier_learner, + meta_learner, + first_tier_parameters=None, + meta_parameters=None, + folded_projections=1, + calmode='cal', + n_jobs=-1): + + super().__init__(first_tier_learner, + meta_learner, + first_tier_parameters, + meta_parameters, + folded_projections, + calmode, + n_jobs) + + self.pca_independent_space = 
PCA(n_components=50) + self.we_path = we_path + self.config = config + self.lang_word2idx = dict() + self.languages = [] + self.lang_tfidf = {} + self.embedding_space = None + self.model = None + self.time = None + self.best_components = None # if auto optimize pca, it will store the optimal number of components + + def vectorize(self, lX, prediction=False): + langs = list(lX.keys()) + print(f'# tfidf-vectorizing docs') + if prediction: + + for lang in langs: + assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language' + tfidf_vectorizer = self.lang_tfidf[lang] + lX[lang] = tfidf_vectorizer.transform(lX[lang]) + return self + + for lang in langs: + tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True) + self.languages.append(lang) + tfidf_vectorizer.fit(lX[lang]) + lX[lang] = tfidf_vectorizer.transform(lX[lang]) + self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_ + self.lang_tfidf[lang] = tfidf_vectorizer + return self + + def _get_zspace(self, lXtr, lYtr): + print('\nfitting the projectors... {}'.format(list(lXtr.keys()))) + self.doc_projector.fit(lXtr, lYtr) + + print('\nprojecting the documents') + lZ = self._projection(self.doc_projector, lXtr) + + return lZ, lYtr + + def fit(self, lX, ly): + tinit = time.time() + print('Vectorizing documents...') + self.vectorize(lX) + + for lang in self.languages: + print(f'{lang}->{lX[lang].shape}') + + Z, zy = self._get_zspace(lX, ly) + + if self.config['supervised'] or self.config['unsupervised']: + self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly) + _embedding_space = self.embedding_space.predict(self.config, lX) + if self.config['max_label_space'] == 0: + if _embedding_space.shape[1] - 300 > 0: + _temp = _embedding_space.shape[1] - 300 + else: + _temp = _embedding_space.shape[1] + self.best_components = _temp + # h_stacking posterior probabilities with (U) and/or (S) matrices + for lang in self.languages: + Z[lang] = np.hstack((Z[lang], _embedding_space[lang])) + + # stacking Z space vertically + _vertical_Z = np.vstack([Z[lang] for lang in self.languages]) + _vertical_Zy = np.vstack([zy[lang] for lang in self.languages]) + + self.standardizer = StandardizeTransformer() + _vertical_Z = self.standardizer.fit_predict(_vertical_Z) + + # todo testing ... + if self.config['post_pca']: + print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...') + self.pca_independent_space.fit(_vertical_Z) + _vertical_Z = self.pca_independent_space.transform(_vertical_Z) + + print('fitting the Z-space of shape={}'.format(_vertical_Z.shape)) + self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, + n_jobs=self.n_jobs) + self.model.fit(_vertical_Z, _vertical_Zy) + self.time = time.time() - tinit + print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min') + + def predict(self, lX, ly): + print('Vectorizing documents') + self.vectorize(lX, prediction=True) + lZ = self._projection(self.doc_projector, lX) + + if self.config['supervised'] or self.config['unsupervised']: + _embedding_space = self.embedding_space.predict(self.config, lX) + + for lang in lX.keys(): + lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang])) + + for lang in lZ.keys(): + print(lZ[lang].shape) + # todo testing + lZ[lang] = self.standardizer.predict(lZ[lang]) + if self.config['post_pca']: + print(f'Applying PCA({"dim ?? 
TODO"}) to Z-space ...') + lZ[lang] = self.pca_independent_space.transform(lZ[lang]) + + return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) + + class PolylingualEmbeddingsClassifier: """ This classifier creates document embeddings by a tfidf weighted average of polylingual embeddings from the article @@ -395,24 +515,21 @@ class PolylingualEmbeddingsClassifier: langs = list(lX.keys()) WEtr, Ytr = [], [] self.fit_vectorizers(lX) # if already fit, does nothing - _lX = dict() for lang in langs: - _lX[lang] = self.lang_tfidf[lang].transform(lX[lang]) WEtr.append(self.embed(lX[lang], lang)) Ytr.append(ly[lang]) - # TODO @Andrea --> here embeddings should be stacked horizontally! WEtr = np.vstack(WEtr) Ytr = np.vstack(Ytr) self.embed_time = time.time() - tinit print('fitting the WE-space of shape={}'.format(WEtr.shape)) self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs) - self.model.fit(_lX['da'], ly['da']) + self.model.fit(WEtr, Ytr) self.time = time.time() - tinit return self - def predict(self, lX): + def predict(self, lX, lY): """ :param lX: a dictionary {language_label: [list of preprocessed documents]} """ @@ -427,123 +544,8 @@ class PolylingualEmbeddingsClassifier: """ assert self.model is not None, 'predict called before fit' langs = list(lX.keys()) - # lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory - return _joblib_transform_multiling(self.model.predict_proba, self.lang_tfidf['da'], n_jobs=self.n_jobs) + lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory + return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs) def best_params(self): - return self.model.best_params() - - -class AndreaCLF(FunnellingPolylingualClassifier): - def __init__(self, - we_path, - config, - first_tier_learner, - meta_learner, - first_tier_parameters=None, - meta_parameters=None, - folded_projections=1, - calmode='cal', - n_jobs=-1): - - super().__init__(first_tier_learner, - meta_learner, - first_tier_parameters, - meta_parameters, - folded_projections, - calmode, - n_jobs) - - self.pca_independent_space = PCA(n_components=100) - self.we_path = we_path - self.config = config - self.lang_word2idx = dict() - self.languages = [] - self.lang_tfidf = {} - self.embedding_space = None - self.model = None - self.time = None - - def vectorize(self, lX, prediction=False): - langs = list(lX.keys()) - print(f'# tfidf-vectorizing docs') - if prediction: - for lang in langs: - assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language' - tfidf_vectorizer = self.lang_tfidf[lang] - lX[lang] = tfidf_vectorizer.transform(lX[lang]) - return self - - for lang in langs: - tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True) - self.languages.append(lang) - tfidf_vectorizer.fit(lX[lang]) - lX[lang] = tfidf_vectorizer.transform(lX[lang]) - self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_ - self.lang_tfidf[lang] = tfidf_vectorizer - return self - - # @override std class method - def _get_zspace(self, lXtr, lYtr): - print('\nfitting the projectors... 
{}'.format(list(lXtr.keys()))) - self.doc_projector.fit(lXtr, lYtr) - - print('\nprojecting the documents') - lZ = self._projection(self.doc_projector, lXtr) - - return lZ, lYtr - - # @override std class method - def fit(self, lX, ly): - tinit = time.time() - print('Vectorizing documents...') - self.vectorize(lX) - - for lang in self.languages: - print(f'{lang}->{lX[lang].shape}') - - Z, zy = self._get_zspace(lX, ly) - - if self.config['supervised'] or self.config['unsupervised']: - self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly) - _embedding_space = self.embedding_space.predict(self.config, lX) - # h_stacking posterior probabilities with (U) and/or (S) matrices - for lang in self.languages: - Z[lang] = np.hstack((Z[lang], _embedding_space[lang])) - - # stacking Z space vertically - _vertical_Z = np.vstack([Z[lang] for lang in self.languages]) - _vertical_Zy = np.vstack([zy[lang] for lang in self.languages]) - - # todo testing ... - # self.pca_independent_space.fit(_vertical_Z) - # _vertical_Z = self.pca_independent_space.transform(_vertical_Z) - - self.standardizer = StandardizeTransformer() - _vertical_Z = self.standardizer.fit_predict(_vertical_Z) - - print('fitting the Z-space of shape={}'.format(_vertical_Z.shape)) - self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, - n_jobs=self.n_jobs) - self.model.fit(_vertical_Z, _vertical_Zy) - self.time = time.time() - tinit - print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min') - - def predict(self, lX, ly): - print('Vectorizing documents') - self.vectorize(lX, prediction=True) - lZ = self._projection(self.doc_projector, lX) - - if self.config['supervised'] or self.config['unsupervised']: - _embedding_space = self.embedding_space.predict(self.config, lX) - - for lang in lX.keys(): - lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang])) - - for lang in lZ.keys(): - print(lZ[lang].shape) - # todo testing - # lZ[lang] = self.pca_independent_space.transform(lZ[lang]) - lZ[lang] = self.standardizer.predict(lZ[lang]) - - return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) + return self.model.best_params() \ No newline at end of file diff --git a/src/util/results.py b/src/util/results.py index 7c25bec..a889e6d 100644 --- a/src/util/results.py +++ b/src/util/results.py @@ -5,7 +5,8 @@ import numpy as np class PolylingualClassificationResults: def __init__(self, file, autoflush=True, verbose=False): self.file = file - self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes'] + self.columns = ['method', 'learner', 'embed', 'embed_type', 'pca_s', 'pca_u', 'optimp', 'dataset', 'time', + 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes'] self.autoflush = autoflush self.verbose = verbose if os.path.exists(file): @@ -20,8 +21,8 @@ class PolylingualClassificationResults: def already_calculated(self, id): return (self.df['id'] == id).any() - def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) + def add_row(self, method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): + s = pd.Series([method, 
learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) self.df = self.df.append(s, ignore_index=True) if self.autoflush: self.flush() self.tell(s.to_string()) From 8940c99102ed65f3a5b8a6fe203136e19e48f7ec Mon Sep 17 00:00:00 2001 From: andrea Date: Thu, 12 Dec 2019 14:33:41 +0100 Subject: [PATCH 2/6] also saving n_components if auto optimizing it removed some unnecessary columns from result csv --- src/FPEC_andrea.py | 9 ++++----- src/data/embeddings.py | 20 +++++++++++++++++--- src/learning/learners.py | 23 ++++++++++++----------- src/util/decompositions.py | 1 + 4 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 16934df..0ed414e 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -47,9 +47,9 @@ parser.add_option("-u", "--upca", dest="max_labels_U", type=int, parser.add_option("-l", dest="lang", type=str) -parser.add_option("-a", dest="post_pca", - help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with " - "embedding space", default=False) +# parser.add_option("-a", dest="post_pca", +# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with " +# "embedding space", default=False) def get_learner(calibrate=False, kernel='linear'): @@ -118,11 +118,10 @@ if __name__ == '__main__': 'we_type': op.we_type} _config_id = 'M_and_F' - ##### TODO - config dict is redundant - we have already op argparse ... config['reduction'] = 'PCA' config['max_label_space'] = op.max_labels_S config['dim_reduction_unsupervised'] = op.max_labels_U - config['post_pca'] = op.post_pca + # config['post_pca'] = op.post_pca # config['plot_covariance_matrices'] = True result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') diff --git a/src/data/embeddings.py b/src/data/embeddings.py index fb1f135..4b19b4a 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -222,26 +222,40 @@ class StorageEmbeddings: elif max_label_space < nC: self.lang_U = run_pca(max_label_space, self.lang_U) - return def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc): for lang in docs.keys(): # compute supervised matrices S - then apply PCA print(f'# [supervised-matrix] for {lang}') - self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang) + self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], + reduction, max_label_space, voc[lang], lang) nC = self.lang_S[lang].shape[1] print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n') if max_label_space == 0: print(f'Computing optimal number of PCA components along matrices S') optimal_n = get_optimal_dim(self.lang_S, 'S') + print(f'Applying PCA(n_components={optimal_n})') self.lang_S = run_pca(optimal_n, self.lang_S) elif max_label_space == -1: print(f'Computing PCA on vertical stacked WCE embeddings') languages = self.lang_S.keys() _temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) - stacked_pca = PCA(n_components=50) + stacked_pca = PCA(n_components=_temp_stack.shape[1]) stacked_pca.fit(_temp_stack) + best_n = None + _r = stacked_pca.explained_variance_ratio_ + _r = np.cumsum(_r) + plt.plot(_r, label='Stacked Supervised') + for i in range(len(_r) - 1, 1, -1): + delta = _r[i] - _r[i - 1] + if delta > 0: + best_n = i + break + plt.show() + stacked_pca = PCA(n_components=best_n) + 
stacked_pca.fit(_temp_stack) + print(f'Applying PCA(n_components={i}') for lang in languages: self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang]) elif max_label_space < nC: diff --git a/src/learning/learners.py b/src/learning/learners.py index 1d119e3..5d3f7fa 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -353,7 +353,7 @@ class AndreaCLF(FunnellingPolylingualClassifier): self.embedding_space = None self.model = None self.time = None - self.best_components = None # if auto optimize pca, it will store the optimal number of components + self.best_components = 'not set' # if auto optimize pca, it will store the optimal number of components def vectorize(self, lX, prediction=False): langs = list(lX.keys()) @@ -398,10 +398,11 @@ class AndreaCLF(FunnellingPolylingualClassifier): self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly) _embedding_space = self.embedding_space.predict(self.config, lX) if self.config['max_label_space'] == 0: - if _embedding_space.shape[1] - 300 > 0: - _temp = _embedding_space.shape[1] - 300 + _cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1] + if _cum_dimension - 300 > 0: + _temp = _cum_dimension - 300 else: - _temp = _embedding_space.shape[1] + _temp = _cum_dimension self.best_components = _temp # h_stacking posterior probabilities with (U) and/or (S) matrices for lang in self.languages: @@ -415,10 +416,10 @@ class AndreaCLF(FunnellingPolylingualClassifier): _vertical_Z = self.standardizer.fit_predict(_vertical_Z) # todo testing ... - if self.config['post_pca']: - print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...') - self.pca_independent_space.fit(_vertical_Z) - _vertical_Z = self.pca_independent_space.transform(_vertical_Z) + # if self.config['post_pca']: + # print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...') + # self.pca_independent_space.fit(_vertical_Z) + # _vertical_Z = self.pca_independent_space.transform(_vertical_Z) print('fitting the Z-space of shape={}'.format(_vertical_Z.shape)) self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, @@ -442,9 +443,9 @@ class AndreaCLF(FunnellingPolylingualClassifier): print(lZ[lang].shape) # todo testing lZ[lang] = self.standardizer.predict(lZ[lang]) - if self.config['post_pca']: - print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...') - lZ[lang] = self.pca_independent_space.transform(lZ[lang]) + # if self.config['post_pca']: + # print(f'Applying PCA({"dim ?? 
TODO"}) to Z-space ...') + # lZ[lang] = self.pca_independent_space.transform(lZ[lang]) return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) diff --git a/src/util/decompositions.py b/src/util/decompositions.py index 9029b33..7b50ffc 100644 --- a/src/util/decompositions.py +++ b/src/util/decompositions.py @@ -2,6 +2,7 @@ from sklearn.decomposition import PCA import numpy as np import matplotlib.pyplot as plt + def run_pca(dim, X): """ :param dim: number of pca components to keep From a95511b4d905e5f9d1ce71764cce612a7bf047d6 Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 16 Dec 2019 20:46:09 +0100 Subject: [PATCH 3/6] sketched results reader - removed first tier learners optimization --- src/FPEC_andrea.py | 3 +-- src/data/embeddings.py | 6 +++--- src/learning/learners.py | 2 +- src/results/results_manager.py | 7 +++++++ src/util/decompositions.py | 2 +- src/util/util.py | 2 ++ 6 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 src/results/results_manager.py diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 0ed414e..09514de 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -85,7 +85,6 @@ if __name__ == '__main__': lXtr, lytr = data.training() lXte, lyte = data.test() - if op.set_c != -1: meta_parameters = None else: @@ -152,7 +151,7 @@ if __name__ == '__main__': config=config, first_tier_learner=get_learner(calibrate=True), meta_learner=get_learner(calibrate=False, kernel='rbf'), - first_tier_parameters=get_params(dense=False), + first_tier_parameters=None, # get_params(dense=False),-->first_tier should not be optimized meta_parameters=get_params(dense=True), n_jobs=op.n_jobs) diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 91cb9ee..1e5da1e 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -220,6 +220,7 @@ class StorageEmbeddings: optimal_n = get_optimal_dim(self.lang_U, 'U') self.lang_U = run_pca(optimal_n, self.lang_U) elif max_label_space < nC: + print(f'Applying PCA to unsupervised matrix U') self.lang_U = run_pca(max_label_space, self.lang_U) return @@ -258,7 +259,8 @@ class StorageEmbeddings: print(f'Applying PCA(n_components={i}') for lang in languages: self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang]) - elif max_label_space < nC: + elif max_label_space <= nC: + print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})') self.lang_S = run_pca(max_label_space, self.lang_S) return @@ -276,7 +278,6 @@ class StorageEmbeddings: self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs) return self - def predict(self, config, docs): if config['supervised'] and config['unsupervised']: return self._concatenate_embeddings(docs) @@ -289,4 +290,3 @@ class StorageEmbeddings: for lang in docs.keys(): _r[lang] = docs[lang].dot(self.lang_U[lang]) return _r - diff --git a/src/learning/learners.py b/src/learning/learners.py index 5d3f7fa..89420bb 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -549,4 +549,4 @@ class PolylingualEmbeddingsClassifier: return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs) def best_params(self): - return self.model.best_params() \ No newline at end of file + return self.model.best_params() diff --git a/src/results/results_manager.py b/src/results/results_manager.py new file mode 100644 index 0000000..af074af --- /dev/null +++ b/src/results/results_manager.py @@ -0,0 +1,7 @@ +import pandas as pd +import numpy as np + +df = 
pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t') +pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['embed'], aggfunc=[np.mean, np.std]) +print(pivot) +print('Finished ...') \ No newline at end of file diff --git a/src/util/decompositions.py b/src/util/decompositions.py index 7b50ffc..9d14a0c 100644 --- a/src/util/decompositions.py +++ b/src/util/decompositions.py @@ -47,4 +47,4 @@ def get_optimal_dim(X, embed_type): plt.axvline(best_n, color='r', label='optimal N') plt.legend() plt.show() - return best_n \ No newline at end of file + return best_n diff --git a/src/util/util.py b/src/util/util.py index e69de29..1d7b000 100644 --- a/src/util/util.py +++ b/src/util/util.py @@ -0,0 +1,2 @@ +def fill_missing_classes(lXtr, lytr): + pass From 56ee88220b248ee8759c290c62e03473be99cfa7 Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 17 Dec 2019 10:42:29 +0100 Subject: [PATCH 4/6] typos --- src/data/embeddings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 1e5da1e..9d20ec3 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -225,7 +225,7 @@ class StorageEmbeddings: return - def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc): + def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc): for lang in docs.keys(): # compute supervised matrices S - then apply PCA print(f'# [supervised-matrix] for {lang}') self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], @@ -259,7 +259,7 @@ class StorageEmbeddings: print(f'Applying PCA(n_components={i}') for lang in languages: self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang]) - elif max_label_space <= nC: + elif max_label_space <= nC: # also equal in order to reduce it to the same initial dimension print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})') self.lang_S = run_pca(max_label_space, self.lang_S) @@ -275,7 +275,7 @@ class StorageEmbeddings: if config['unsupervised']: self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised']) if config['supervised']: - self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs) + self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs) return self def predict(self, config, docs): From 0e66fbf1972d491c334e9fdd8851e3b02b995ee2 Mon Sep 17 00:00:00 2001 From: andrea Date: Sun, 29 Dec 2019 11:54:05 +0100 Subject: [PATCH 5/6] implemented method to compute WCE only for well represented classes - refactored MLE class in order to support WCE, standard embeddings and combinations --- src/FPEC_andrea.py | 13 +++++++---- src/data/embeddings.py | 12 ++++++++++ src/learning/learners.py | 47 +++++++++++++++++++++++++++++++++------- 3 files changed, 60 insertions(+), 12 deletions(-) diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 09514de..d7452ba 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -125,9 +125,10 @@ if __name__ == '__main__': result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') - PLE_test = False + PLE_test = True if PLE_test: - ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/moreo/CLESA/PolylingualEmbeddings', + ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/', + config = config, 
learner=get_learner(calibrate=False), c_parameters=get_params(dense=False), n_jobs=op.n_jobs) @@ -143,7 +144,11 @@ if __name__ == '__main__': macrof1, microf1, macrok, microk = ple_eval[lang] metrics.append([macrof1, microf1, macrok, microk]) print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) + results.add_row('MLE', 'svm', 'no', config['we_type'], + 'no','no', op.optimc, op.dataset.split('/')[-1], ple.time, + lang, macrof1, microf1, macrok, microk, '') print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) + exit() print(f'### PolyEmbedd_andrea_{_config_id}\n') @@ -151,7 +156,7 @@ if __name__ == '__main__': config=config, first_tier_learner=get_learner(calibrate=True), meta_learner=get_learner(calibrate=False, kernel='rbf'), - first_tier_parameters=None, # get_params(dense=False),-->first_tier should not be optimized + first_tier_parameters=None, # TODO get_params(dense=False),--> first_tier should not be optimized - or not? meta_parameters=get_params(dense=True), n_jobs=op.n_jobs) @@ -169,5 +174,5 @@ if __name__ == '__main__': results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], (config['max_label_space'], classifier.best_components), config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time, - lang, macrof1, microf1, macrok, microk, '') + lang, macrof1, microf1, macrok, microk, 'min_prevalence = 0') print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 9d20ec3..082a9cf 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -226,6 +226,18 @@ class StorageEmbeddings: return def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc): + only_well_represented_C = False # TODO testing + if only_well_represented_C: + labels = labels.copy() + min_prevalence = 0 + print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...') + langs = list(docs.keys()) + well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs]) + # lY = {lY[lang][:, well_repr_cats] for lang in langs} TODO not clear + for lang in langs: + labels[lang] = labels[lang][:, well_repr_cats] + print(f'Target number reduced to: {labels[lang].shape[1]}\n') + for lang in docs.keys(): # compute supervised matrices S - then apply PCA print(f'# [supervised-matrix] for {lang}') self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], diff --git a/src/learning/learners.py b/src/learning/learners.py index 89420bb..5e44bd3 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -461,7 +461,7 @@ class PolylingualEmbeddingsClassifier: } url: https://github.com/facebookresearch/MUSE """ - def __init__(self, wordembeddings_path, learner, c_parameters=None, n_jobs=-1): + def __init__(self, wordembeddings_path, config, learner, c_parameters=None, n_jobs=-1): """ :param wordembeddings_path: the path to the directory containing the polylingual embeddings :param learner: the learner @@ -469,11 +469,15 @@ class PolylingualEmbeddingsClassifier: :param n_jobs: the number of concurrent threads """ self.wordembeddings_path = wordembeddings_path + self.config = config self.learner = learner self.c_parameters=c_parameters self.n_jobs = n_jobs self.lang_tfidf = {} self.model = None + self.languages = [] + self.lang_word2idx = dict() + self.embedding_space = None def fit_vectorizers(self, lX): for lang in lX.keys(): @@ -483,6 +487,27 @@ class 
PolylingualEmbeddingsClassifier: tfidf.fit(docs) self.lang_tfidf[lang] = tfidf + + def vectorize(self, lX, prediction=False): + langs = list(lX.keys()) + print(f'# tfidf-vectorizing docs') + if prediction: + + for lang in langs: + assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language' + tfidf_vectorizer = self.lang_tfidf[lang] + lX[lang] = tfidf_vectorizer.transform(lX[lang]) + return self + + for lang in langs: + tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True) + self.languages.append(lang) + tfidf_vectorizer.fit(lX[lang]) + lX[lang] = tfidf_vectorizer.transform(lX[lang]) + self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_ + self.lang_tfidf[lang] = tfidf_vectorizer + return self + def embed(self, docs, lang): assert lang in self.lang_tfidf, 'unknown language' tfidf_vectorizer = self.lang_tfidf[lang] @@ -515,13 +540,17 @@ class PolylingualEmbeddingsClassifier: tinit = time.time() langs = list(lX.keys()) WEtr, Ytr = [], [] - self.fit_vectorizers(lX) # if already fit, does nothing - for lang in langs: - WEtr.append(self.embed(lX[lang], lang)) - Ytr.append(ly[lang]) + # self.fit_vectorizers(lX) # if already fit, does nothing + self.vectorize(lX) + # config = {'unsupervised' : False, 'supervised': True} + self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly) + WEtr = self.embedding_space.predict(self.config, lX) + # for lang in langs: + # WEtr.append(self.embed(lX[lang], lang)) # todo embed with other matrices + # Ytr.append(ly[lang]) - WEtr = np.vstack(WEtr) - Ytr = np.vstack(Ytr) + WEtr = np.vstack([WEtr[lang] for lang in langs]) + Ytr = np.vstack([ly[lang] for lang in langs]) self.embed_time = time.time() - tinit print('fitting the WE-space of shape={}'.format(WEtr.shape)) @@ -535,8 +564,10 @@ class PolylingualEmbeddingsClassifier: :param lX: a dictionary {language_label: [list of preprocessed documents]} """ assert self.model is not None, 'predict called before fit' + self.vectorize(lX, prediction=True) langs = list(lX.keys()) - lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory + lWEte = self.embedding_space.predict(self.config, lX) + # lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs) def predict_proba(self, lX): From 53198a7e2cd63dd9c55cc4fc226d9fa9011409fa Mon Sep 17 00:00:00 2001 From: andrea Date: Tue, 7 Jan 2020 17:05:41 +0100 Subject: [PATCH 6/6] implemented method to compute WCE only for well represented classes; refactored MLE class in order to support WCE, standard embeddings and combinations; sketched out NN implementation for WE compositionality; still TODO SIF embeddings; --- src/FPEC_andrea.py | 32 +----- src/MLE_andrea.py | 128 +++++++++++++++++++++++ src/NN_FPEC_andrea.py | 92 ++++++++++++++++ src/{data => learning}/embeddings.py | 19 ++-- src/learning/learners.py | 151 ++++++++++++++++++++++++++- src/{data => learning}/supervised.py | 0 src/models/cnn_class.py | 42 ++++++++ src/results/results_manager.py | 2 +- src/util/SIF_embed.py | 56 ++++++++++ src/util/util.py | 13 +++ 10 files changed, 498 insertions(+), 37 deletions(-) create mode 100644 src/MLE_andrea.py create mode 100644 src/NN_FPEC_andrea.py rename src/{data => learning}/embeddings.py (94%) rename src/{data => learning}/supervised.py (100%) create mode 100644 src/models/cnn_class.py create mode 100644 
src/util/SIF_embed.py diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index d7452ba..3c351b6 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -6,7 +6,7 @@ from optparse import OptionParser from util.file import exists from util.results import PolylingualClassificationResults from sklearn.svm import SVC - +from util.util import get_learner, get_params parser = OptionParser() @@ -115,7 +115,7 @@ if __name__ == '__main__': config = {'unsupervised': True, 'supervised': True, 'we_type': op.we_type} - _config_id = 'M_and_F' + _config_id = 'M+F' config['reduction'] = 'PCA' config['max_label_space'] = op.max_labels_S @@ -125,32 +125,6 @@ if __name__ == '__main__': result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') - PLE_test = True - if PLE_test: - ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/', - config = config, - learner=get_learner(calibrate=False), - c_parameters=get_params(dense=False), - n_jobs=op.n_jobs) - - print('# Fitting ...') - ple.fit(lXtr, lytr) - - print('# Evaluating ...') - ple_eval = evaluate_method(ple, lXte, lyte) - - metrics = [] - for lang in lXte.keys(): - macrof1, microf1, macrok, microk = ple_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) - results.add_row('MLE', 'svm', 'no', config['we_type'], - 'no','no', op.optimc, op.dataset.split('/')[-1], ple.time, - lang, macrof1, microf1, macrok, microk, '') - print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) - exit() - - print(f'### PolyEmbedd_andrea_{_config_id}\n') classifier = AndreaCLF(we_path=op.we_path, config=config, @@ -174,5 +148,5 @@ if __name__ == '__main__': results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], (config['max_label_space'], classifier.best_components), config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time, - lang, macrof1, microf1, macrok, microk, 'min_prevalence = 0') + lang, macrof1, microf1, macrok, microk, '') print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/MLE_andrea.py b/src/MLE_andrea.py new file mode 100644 index 0000000..51cafc8 --- /dev/null +++ b/src/MLE_andrea.py @@ -0,0 +1,128 @@ +import os +from dataset_builder import MultilingualDataset +from learning.learners import * +from util.evaluation import * +from optparse import OptionParser +from util.file import exists +from util.results import PolylingualClassificationResults +from util.util import get_learner, get_params + +parser = OptionParser() + +parser.add_option("-d", "--dataset", dest="dataset", + help="Path to the multilingual dataset processed and stored in .pickle format", + default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") + +parser.add_option("-o", "--output", dest="output", + help="Result file", type=str, default='./results/results.csv') + +parser.add_option("-e", "--mode-embed", dest="mode_embed", + help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none') + +parser.add_option("-w", "--we-path", dest="we_path", + help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/') + +parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str, + default='MUSE') + +parser.add_option("-s", "--set_c", dest="set_c",type=float, + help="Set the C parameter", default=1) + 
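Regarding the "well represented classes" filter mentioned in the commit message above (and guarded by only_well_represented_C in _add_embeddings_supervised): the idea is to drop, before computing the WCE matrix, every category whose count of positive training documents does not exceed min_prevalence in some language. A minimal stand-alone sketch of that filter follows; the function name filter_well_represented is illustrative, though the reduction mirrors the np.logical_and.reduce logic used in the patch:

    import numpy as np

    def filter_well_represented(labels, min_prevalence=0):
        # labels: {language: (n_docs, n_classes) binary label matrix}
        langs = list(labels.keys())
        # a category is kept only if its positive count exceeds min_prevalence in every language
        keep = np.logical_and.reduce([labels[l].sum(axis=0) > min_prevalence for l in langs])
        return {l: labels[l][:, keep] for l in langs}, keep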
+parser.add_option("-c", "--optimc", dest="optimc", action='store_true', + help="Optimize hyperparameters", default=False) + +parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, + help="Number of parallel jobs (default is -1, all)", default=-1) + +parser.add_option("-p", "--pca", dest="max_labels_S", type=int, + help="If smaller than number of target classes, PCA will be applied to supervised matrix. " + "If set to 0 it will automatically search for the best number of components. " + "If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)", + default=300) + +parser.add_option("-u", "--upca", dest="max_labels_U", type=int, + help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." + " If set to 0 it will automatically search for the best number of components", default=300) + +parser.add_option("-l", dest="lang", type=str) + +if __name__ == '__main__': + (op, args) = parser.parse_args() + + assert exists(op.dataset), 'Unable to find file '+str(op.dataset) + assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option' + + dataset_file = os.path.basename(op.dataset) + + results = PolylingualClassificationResults('./results/PLE_results.csv') + + data = MultilingualDataset.load(op.dataset) + data.show_dimensions() + + # data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10))) + # data.set_view(languages=[op.lang]) + # data.set_view(categories=list(range(10))) + lXtr, lytr = data.training() + lXte, lyte = data.test() + + if op.set_c != -1: + meta_parameters = None + else: + meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}] + + # Embeddings and WCE config + _available_mode = ['none', 'unsupervised', 'supervised', 'both'] + _available_type = ['MUSE', 'FastText'] + assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}' + assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}' + + if op.mode_embed == 'none': + config = {'unsupervised': False, + 'supervised': False, + 'we_type': None} + _config_id = 'None' + elif op.mode_embed == 'unsupervised': + config = {'unsupervised': True, + 'supervised': False, + 'we_type': op.we_type} + _config_id = 'M' + elif op.mode_embed == 'supervised': + config = {'unsupervised': False, + 'supervised': True, + 'we_type': None} + _config_id = 'F' + elif op.mode_embed == 'both': + config = {'unsupervised': True, + 'supervised': True, + 'we_type': op.we_type} + _config_id = 'M+F' + + config['reduction'] = 'PCA' + config['max_label_space'] = op.max_labels_S + config['dim_reduction_unsupervised'] = op.max_labels_U + # config['post_pca'] = op.post_pca + # config['plot_covariance_matrices'] = True + + result_id = dataset_file + 'MLE_andrea' + _config_id + ('_optimC' if op.optimc else '') + + ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/', + config = config, + learner=get_learner(calibrate=False), + c_parameters=get_params(dense=False), + n_jobs=op.n_jobs) + + print('# Fitting ...') + ple.fit(lXtr, lytr) + + print('# Evaluating ...') + ple_eval = evaluate_method(ple, lXte, lyte) + + metrics = [] + for lang in lXte.keys(): + macrof1, microf1, macrok, microk = ple_eval[lang] + metrics.append([macrof1, microf1, macrok, microk]) + print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) + results.add_row('MLE', 'svm', _config_id, config['we_type'], + 'no','no', op.optimc, op.dataset.split('/')[-1], ple.time, + lang, macrof1, microf1, macrok, 
microk, '') + print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/NN_FPEC_andrea.py b/src/NN_FPEC_andrea.py new file mode 100644 index 0000000..156d726 --- /dev/null +++ b/src/NN_FPEC_andrea.py @@ -0,0 +1,92 @@ +from optparse import OptionParser +from util.results import PolylingualClassificationResults +from dataset_builder import MultilingualDataset +from keras.preprocessing.text import Tokenizer +from learning.learners import MonolingualNetSvm +from sklearn.svm import SVC +import pickle + +parser = OptionParser() + +parser.add_option("-d", "--dataset", dest="dataset", + help="Path to the multilingual dataset processed and stored in .pickle format", + default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") + +parser.add_option("-c", "--optimc", dest="optimc", action='store_true', + help="Optimize hyperparameters", default=False) + +parser.add_option("-s", "--set_c", dest="set_c",type=float, + help="Set the C parameter", default=1) + +(op, args) = parser.parse_args() + + +################################################################################################################### + +def get_learner(calibrate=False, kernel='linear'): + return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto') + + +def get_params(dense=False): + if not op.optimc: + return None + c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] + kernel = 'rbf' if dense else 'linear' + return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] + + +# PREPROCESS TEXT AND SAVE IT ... both for SVM and NN +def preprocess_data(lXtr, lXte, lytr, lyte): + tokenized_tr = dict() + tokenized_te = dict() + for lang in lXtr.keys(): + alltexts = ' '.join(lXtr[lang]) + tokenizer = Tokenizer() + tokenizer.fit_on_texts(alltexts.split(' ')) + tokenizer.oov_token = len(tokenizer.word_index)+1 + # dumping train set + sequences_tr = tokenizer.texts_to_sequences(lXtr[lang]) + tokenized_tr[lang] = (tokenizer.word_index, sequences_tr, lytr[lang]) + # dumping test set + sequences_te = tokenizer.texts_to_sequences(lXte[lang]) + tokenized_te[lang] = (tokenizer.word_index, sequences_te, lyte[lang]) + + with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'wb') as f: + pickle.dump(tokenized_tr, f) + + with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_test.pickle', 'wb') as f: + pickle.dump(tokenized_tr, f) + + print('Successfully dumped data') + +# def load_preprocessed(): +# with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f: +# return pickle.load(f) +# +# def build_embedding_matrix(lang, word_index): +# type = 'MUSE' +# path = '/home/andreapdr/CLESA/' +# MUSE = EmbeddingsAligned(type, path, lang, word_index.keys()) +# return MUSE + + +########## MAIN ################################################################################################# + +if __name__ == '__main__': + results = PolylingualClassificationResults('./results/NN_FPEC_results.csv') + data = MultilingualDataset.load(op.dataset) + lXtr, lytr = data.training() + lXte, lyte = data.test() + + if op.set_c != -1: + meta_parameters = None + else: + meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}] + + test_architecture = MonolingualNetSvm(lXtr, + lytr, + first_tier_learner=get_learner(calibrate=True), + first_tier_parameters=None, + n_jobs=1) + + test_architecture.fit() diff --git a/src/data/embeddings.py b/src/learning/embeddings.py similarity index 94% 
rename from src/data/embeddings.py rename to src/learning/embeddings.py index 082a9cf..65a5338 100644 --- a/src/data/embeddings.py +++ b/src/learning/embeddings.py @@ -3,8 +3,9 @@ import pickle from torchtext.vocab import Vectors import torch from abc import ABC, abstractmethod -from data.supervised import get_supervised_embeddings +from learning.supervised import get_supervised_embeddings from util.decompositions import * +from util.SIF_embed import * class PretrainedEmbeddings(ABC): @@ -233,7 +234,6 @@ class StorageEmbeddings: print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...') langs = list(docs.keys()) well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs]) - # lY = {lY[lang][:, well_repr_cats] for lang in langs} TODO not clear for lang in langs: labels[lang] = labels[lang][:, well_repr_cats] print(f'Target number reduced to: {labels[lang].shape[1]}\n') @@ -245,15 +245,15 @@ class StorageEmbeddings: nC = self.lang_S[lang].shape[1] print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n') - if max_label_space == 0: + if max_label_space == 0: # looking for best n_components analyzing explained_variance_ratio print(f'Computing optimal number of PCA components along matrices S') optimal_n = get_optimal_dim(self.lang_S, 'S') print(f'Applying PCA(n_components={optimal_n})') self.lang_S = run_pca(optimal_n, self.lang_S) - elif max_label_space == -1: + elif max_label_space == -1: # applying pca to the verticals stacked matrix of WCE embeddings print(f'Computing PCA on vertical stacked WCE embeddings') languages = self.lang_S.keys() - _temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) + _temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) # stacking WCE vertically stacked_pca = PCA(n_components=_temp_stack.shape[1]) stacked_pca.fit(_temp_stack) best_n = None @@ -271,12 +271,15 @@ class StorageEmbeddings: print(f'Applying PCA(n_components={i}') for lang in languages: self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang]) - elif max_label_space <= nC: # also equal in order to reduce it to the same initial dimension + elif max_label_space <= nC: # less or equal in order to reduce it to the same initial dimension print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})') self.lang_S = run_pca(max_label_space, self.lang_S) return + def SIF_embeddings(self): + print('todo') # TODO + def _concatenate_embeddings(self, docs): _r = dict() for lang in self.lang_U.keys(): @@ -293,6 +296,9 @@ class StorageEmbeddings: def predict(self, config, docs): if config['supervised'] and config['unsupervised']: return self._concatenate_embeddings(docs) + # todo testing applying pca to hstack muse + wce + # _reduced = self._concatenate_embeddings(docs) + # return run_pca(300, _reduced) elif config['supervised']: _r = dict() for lang in docs.keys(): @@ -301,4 +307,5 @@ class StorageEmbeddings: _r = dict() for lang in docs.keys(): _r[lang] = docs[lang].dot(self.lang_U[lang]) + return _r diff --git a/src/learning/learners.py b/src/learning/learners.py index 5e44bd3..a678905 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -1,6 +1,6 @@ import numpy as np import time -from data.embeddings import WordEmbeddings, StorageEmbeddings +from learning.embeddings import WordEmbeddings, StorageEmbeddings from scipy.sparse import issparse from sklearn.multiclass import OneVsRestClassifier from sklearn.model_selection import GridSearchCV @@ -9,6 +9,7 @@ 
from joblib import Parallel, delayed from sklearn.feature_extraction.text import TfidfVectorizer from transformers.StandardizeTransformer import StandardizeTransformer from sklearn.decomposition import PCA +from models.cnn_class import CNN_pdr def _sort_if_sparse(X): @@ -581,3 +582,151 @@ class PolylingualEmbeddingsClassifier: def best_params(self): return self.model.best_params() + + +class MonolingualNetSvm: + """ + testing: funnelling with NN managing word embeddings compositionality. An ensemble of n-SVMs (n equals to the + number of training languages) is first fit on the data, generating the documents projection in the Z-space. Next, + the projection are fed to a single NN with their respective document embeddings. The documents are projected into + the embedding space while preserving their dimensionality (output dim is 300). These projection are horizonatally + concatenated with the respective projection and passed through a fC layer with sigmoid act and output dim equal + to the number of target classes. + # TODO ATM testing with only 1 language + """ + def __init__(self, lX, ly, first_tier_learner, first_tier_parameters, n_jobs): + self.lX = lX + self.ly = ly + # SVM Attributes + self.doc_projector = NaivePolylingualClassifier(first_tier_learner, first_tier_parameters, + n_jobs=n_jobs) + self.calmode = 'cal' + self.languages = [] + self.lang_word2idx = dict() + self.lang_tfidf = {} + self.base_learner = 'TODO' + self.parameters = 'TODO' + # NN Attributes + self.NN = 'TODO' + + + def load_preprocessed(self): + """ + in order to speed up the process, documents are first tokenized in the "main". Here, tokenized docs, word_index, and + targets are loaded. + :return: dict[lang] = (word_index, tokenized_docs, targets) + """ + import pickle + with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f: + return pickle.load(f) + + def _build_embedding_matrix(self, lang, word_index): + """ + build embedding matrix by filtering out OOV embeddings + :param lang: + :param word_index: + :return: filtered embedding matrix + """ + from learning.embeddings import EmbeddingsAligned + type = 'MUSE' + path = '/home/andreapdr/CLESA/' + MUSE = EmbeddingsAligned(type, path, lang, word_index.keys()) + return MUSE + + def get_data_and_embed(self, data_dict): + from keras.preprocessing.sequence import pad_sequences + + langs = data_dict.keys() + lang_embedding_matrix = dict() + nn_lXtr = dict() + nn_lytr = dict() + + for lang in langs: + lang_embedding_matrix[lang] = self._build_embedding_matrix(lang, data_dict[lang][0]) + nn_lXtr[lang] = pad_sequences(data_dict[lang][1], 100, padding='post') + nn_lytr[lang] = [data_dict[lang][2]] + + return nn_lXtr, nn_lytr, lang_embedding_matrix + + def svm_vectorize(self, lX, prediction=False): + langs = list(lX.keys()) + print(f'# tfidf-vectorizing docs') + if prediction: + for lang in langs: + assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language' + tfidf_vectorizer = self.lang_tfidf[lang] + lX[lang] = tfidf_vectorizer.transform(lX[lang]) + return self + for lang in langs: + tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True) + self.languages.append(lang) + tfidf_vectorizer.fit(lX[lang]) + lX[lang] = tfidf_vectorizer.transform(lX[lang]) + self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_ + self.lang_tfidf[lang] = tfidf_vectorizer + return lX + + def _get_zspace(self, lXtr, lYtr): + print('\nfitting the projectors... 
{}'.format(list(lXtr.keys())))
+        self.doc_projector.fit(lXtr, lYtr)
+
+        print('\nprojecting the documents')
+        lZ = self._projection(self.doc_projector, lXtr)
+
+        return lZ, lYtr
+
+    def _projection(self, doc_projector, lX):
+        """
+        Decides the projection function to be applied: predict_proba if the base classifiers are calibrated,
+        decision_function otherwise
+        :param doc_projector: the document projector (a NaivePolylingualClassifier)
+        :param lX: {lang: matrix} to project
+        :return: the projection, computed with predict_proba or decision_function
+        """
+        if self.calmode=='cal':
+            return doc_projector.predict_proba(lX)
+        else:
+            l_decision_scores = doc_projector.decision_function(lX)
+            if self.calmode=='sigmoid':
+                def sigmoid(x): return 1 / (1 + np.exp(-x))
+                for lang in l_decision_scores.keys():
+                    l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
+            return l_decision_scores
+
+    def fit(self):
+        """
+        # 1. Fit SVM to generate posterior probabilities:
+        # 1.1 Gather documents and vectorize them as in other SVM classifiers
+        # 2. Fit NN
+        # 2.1 Gather documents and build NN dataset by indexing wrt embedding matrix
+        # 2.2 Fit NN first-layer to generate compositional doc embedding
+        # 2.3 H-stack doc-embed and posterior P
+        # 2.4 Feed stacked vector to output layer (sigmoid act): output Nc
+        # 2.5 Train it...
+        """
+
+        # load pre-processed data
+        data_dict = self.load_preprocessed()
+        # build embedding matrices and neural network document training set
+        nn_lXtr, nn_lytr, lang_embedding_matrix = self.get_data_and_embed(data_dict)
+        # TF-IDF vectorizing documents for the SVM classifiers
+        svm_lX = self.svm_vectorize(self.lX)
+
+        # just testing on a smaller subset of data
+        test_svm_lX = dict()
+        test_svm_ly = dict()
+        test_svm_lX['it'] = svm_lX['it'][:10, :]
+        test_svm_ly['it'] = self.ly['it'][:10, :]
+        test_nn_data = nn_lXtr['it'][:10]
+
+        # projecting documents into the Z-space with the SVM ensemble
+        svm_Z, _ = self._get_zspace(test_svm_lX, test_svm_ly)
+
+        # initializing net and forward pass
+        net = CNN_pdr(73, 1, 300, len(lang_embedding_matrix['it'].vectors), 300, lang_embedding_matrix['it'].vectors)
+        out = net.forward(test_nn_data, svm_Z['it'])
+
+        print('TODO')
+
+    def net(self):
+        pass
\ No newline at end of file
diff --git a/src/data/supervised.py b/src/learning/supervised.py
similarity index 100%
rename from src/data/supervised.py
rename to src/learning/supervised.py
diff --git a/src/models/cnn_class.py b/src/models/cnn_class.py
new file mode 100644
index 0000000..a47d5fc
--- /dev/null
+++ b/src/models/cnn_class.py
@@ -0,0 +1,42 @@
+import torch.nn as nn
+from torch.nn import functional as F
+import torch
+
+class CNN_pdr(nn.Module):
+
+    def __init__(self, output_size, out_channels, compositional_dim, vocab_size, emb_dim, embeddings=None, drop_embedding_range=None,
+                 drop_embedding_prop=0, drop_prob=0.5):
+        super(CNN_pdr, self).__init__()
+        self.vocab_size = vocab_size
+        self.emb_dim = emb_dim
+        self.embeddings = torch.FloatTensor(embeddings)
+        self.embedding_layer = nn.Embedding(vocab_size, emb_dim, _weight=self.embeddings)
+        self.kernel_heights = kernel_heights = [3, 5, 7]
+        self.stride = 1
+        self.padding = 0
+        self.drop_embedding_range = drop_embedding_range
+        self.drop_embedding_prop = drop_embedding_prop
+        assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
+        self.nC = 73
+
+        self.conv1 = nn.Conv2d(1, compositional_dim, (self.kernel_heights[0], self.emb_dim), self.stride, self.padding)
+        self.dropout = nn.Dropout(drop_prob)
+        self.label = nn.Linear(len(kernel_heights) * 
out_channels, output_size) + self.fC = nn.Linear(compositional_dim + self.nC, self.nC) + + + def forward(self, x, svm_output): + x = torch.LongTensor(x) + svm_output = torch.FloatTensor(svm_output) + x = self.embedding_layer(x) + x = self.conv1(x.unsqueeze(1)) + x = F.relu(x.squeeze(3)) + x = F.max_pool1d(x, x.size()[2]).squeeze(2) + x = torch.cat((x, svm_output), 1) + x = F.sigmoid(self.fC(x)) + return x #.detach().numpy() + + # logits = self.label(x) + # return logits + + diff --git a/src/results/results_manager.py b/src/results/results_manager.py index af074af..fdee8d8 100644 --- a/src/results/results_manager.py +++ b/src/results/results_manager.py @@ -2,6 +2,6 @@ import pandas as pd import numpy as np df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t') -pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['embed'], aggfunc=[np.mean, np.std]) +pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'embed'], aggfunc=[np.mean, np.std]) print(pivot) print('Finished ...') \ No newline at end of file diff --git a/src/util/SIF_embed.py b/src/util/SIF_embed.py new file mode 100644 index 0000000..05e2ff7 --- /dev/null +++ b/src/util/SIF_embed.py @@ -0,0 +1,56 @@ +import numpy as np +from sklearn.decomposition import TruncatedSVD + +def get_weighted_average(We, x, w): + """ + Compute the weighted average vectors + :param We: We[i,:] is the vector for word i + :param x: x[i, :] are the indices of the words in sentence i + :param w: w[i, :] are the weights for the words in sentence i + :return: emb[i, :] are the weighted average vector for sentence i + """ + n_samples = x.shape[0] + emb = np.zeros((n_samples, We.shape[1])) + for i in range(n_samples): + emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:]) + return emb + +def compute_pc(X,npc=1): + """ + Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN! 
+    :param X: X[i,:] is a data point
+    :param npc: number of principal components to compute
+    :return: component_[i,:] is the i-th pc
+    """
+    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
+    svd.fit(X)
+    return svd.components_
+
+def remove_pc(X, npc=1):
+    """
+    Remove the projection on the principal components
+    :param X: X[i,:] is a data point
+    :param npc: number of principal components to remove
+    :return: XX[i, :] is the data point after removing its projection
+    """
+    pc = compute_pc(X, npc)
+    if npc==1:
+        XX = X - X.dot(pc.transpose()) * pc
+    else:
+        XX = X - X.dot(pc.transpose()).dot(pc)
+    return XX
+
+
+def SIF_embedding(We, x, w, params):
+    """
+    Compute sentence embeddings as the weighted average of word vectors, removing the projection on the first principal component
+    :param We: We[i,:] is the vector for word i
+    :param x: x[i, :] are the indices of the words in the i-th sentence
+    :param w: w[i, :] are the weights for the words in the i-th sentence
+    :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component
+    :return: emb, emb[i, :] is the embedding for sentence i
+    """
+    emb = get_weighted_average(We, x, w)
+    if params.rmpc > 0:
+        emb = remove_pc(emb, params.rmpc)
+    return emb
\ No newline at end of file
diff --git a/src/util/util.py b/src/util/util.py
index 1d7b000..325abf4 100644
--- a/src/util/util.py
+++ b/src/util/util.py
@@ -1,2 +1,15 @@
+from sklearn.svm import SVC
+
 def fill_missing_classes(lXtr, lytr):
     pass
+
+def get_learner(calibrate=False, kernel='linear', C=1):
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, class_weight='balanced', gamma='auto')
+
+
+def get_params(dense=False, optimc=False):
+    if not optimc:
+        return None
+    c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
+    kernel = 'rbf' if dense else 'linear'
+    return [{'kernel': [kernel], 'C': c_range, 'gamma': ['auto']}]
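
The SIF helpers added in util/SIF_embed.py are imported into learning/embeddings.py but not wired in yet (StorageEmbeddings.SIF_embeddings is still a stub). A minimal sketch of how they compose, using random stand-in data in place of the MUSE matrix and the per-word weights; all shapes and variable names below are illustrative and not part of this patch:

    import numpy as np
    from util.SIF_embed import get_weighted_average, remove_pc

    rng = np.random.RandomState(0)
    We = rng.randn(1000, 300)            # word embedding matrix (e.g. MUSE), one row per word
    x = rng.randint(0, 1000, (50, 100))  # 50 documents, each a row of 100 (padded) word indices
    w = rng.rand(50, 100)                # per-word weights, e.g. tf-idf or SIF a/(a+p(w)) weights

    emb = get_weighted_average(We, x, w)  # (50, 300) weighted-average document vectors
    emb = remove_pc(emb, npc=1)           # subtract the projection on the first principal component
    print(emb.shape)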