From 601da338362d5d65c6ac93dd0dfec521d8c7e6f2 Mon Sep 17 00:00:00 2001
From: andrea
Date: Thu, 28 Nov 2019 23:10:14 +0100
Subject: [PATCH] fixed embeddings remote import and dataset name in logfile

---
 src/FPEC_andrea.py       | 44 +++----------------
 src/data/embeddings.py   |  5 ++-
 src/learning/learners.py | 91 +++++++---------------------------------
 3 files changed, 25 insertions(+), 115 deletions(-)

diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index 3b27def..ed203ce 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -1,4 +1,3 @@
-from sklearn.svm import SVC
 import os, sys
 from dataset_builder import MultilingualDataset
 from learning.learners import *
@@ -6,6 +5,7 @@ from util.evaluation import *
 from optparse import OptionParser
 from util.file import exists
 from util.results import PolylingualClassificationResults
+from sklearn.svm import SVC
 
 parser = OptionParser()
 
@@ -20,7 +20,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed",
                   help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
 
 parser.add_option("-w", "--we-path", dest="we_path",
-                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings')
+                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings/')
 
 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)
@@ -63,41 +63,6 @@ if __name__ == '__main__':
     lXtr, lytr = data.training()
     lXte, lyte = data.test()
 
-    print(lXtr.keys())
-
-    small_lXtr = dict()
-    small_lytr = dict()
-    small_lXte = dict()
-    small_lyte = dict()
-
-    small_lXtr['da'] = lXtr['da'][:50]
-    small_lytr['da'] = lytr['da'][:50]
-    # small_lXtr['en'] = lXtr['en'][:50]
-    # small_lytr['en'] = lytr['en'][:50]
-    # small_lXtr['fr'] = lXtr['fr'][:50]
-    # small_lytr['fr'] = lytr['fr'][:50]
-    # small_lXte['da'] = lXte['da'][:50]
-    # small_lyte['da'] = lyte['da'][:50]
-    # small_lXte['en'] = lXte['en'][:50]
-    # small_lyte['en'] = lyte['en'][:50]
-    # small_lXte['fr'] = lXte['fr'][:50]
-    # small_lyte['fr'] = lyte['fr'][:50]
-    # small_lXtr['it'] = lXtr['it'][:50]
-    # small_lytr['it'] = lytr['it'][:50]
-    # small_lXtr['es'] = lXtr['es'][:50]
-    # small_lytr['es'] = lytr['es'][:50]
-    # small_lXtr['de'] = lXtr['de'][:50]
-    # small_lytr['de'] = lytr['de'][:50]
-    # small_lXtr['pt'] = lXtr['pt'][:50]
-    # small_lytr['pt'] = lytr['pt'][:50]
-    # small_lXtr['nl'] = lXtr['de'][:50]
-    # small_lytr['nl'] = lytr['de'][:50]
-    # small_lXtr['fi'] = lXtr['fi'][:50]
-    # small_lytr['fi'] = lytr['fi'][:50]
-    # small_lXtr['hu'] = lXtr['hu'][:50]
-    # small_lytr['hu'] = lytr['hu'][:50]
-    # small_lXtr['sv'] = lXtr['sv'][:50]
-    # small_lytr['sv'] = lytr['sv'][:50]
 
     if op.set_c != -1:
         meta_parameters = None
@@ -137,7 +102,7 @@ if __name__ == '__main__':
                            n_jobs=op.n_jobs)
 
     print('# Fitting ...')
-    classifier.fit(small_lXtr, small_lytr)
+    classifier.fit(lXtr, lytr)
 
     print('# Evaluating ...')
     l_eval = evaluate_method(classifier, lXte, lyte)
@@ -147,5 +112,6 @@ if __name__ == '__main__':
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, 'test_datasetname', 'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
+        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1],
+                        'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index 7056d3b..0a7aa4c 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -147,10 +147,13 @@ class FastTextWikiNews(Vectors):
 
     url_base = 'Cant auto-download MUSE embeddings'
     path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
+    _name = 'wiki.multi.{}.vec'
 
     def __init__(self, cache, language="en", **kwargs):
         url = self.url_base.format(language)
-        name = self.path.format(language)
+        # name = self.path.format(language)
+        name = cache + self._name.format(language)
+        # print(f'\n\nFASTEXTWIKI-NEW CLASS:\nurl = {url}\nname = {name}\ncache {cache}\nlanguage = {language}')
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
 
 
diff --git a/src/learning/learners.py b/src/learning/learners.py
index e540018..77895ce 100644
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@@ -1,11 +1,10 @@
 import numpy as np
 import time
 from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix
-from scipy.sparse import issparse, csr_matrix
+from scipy.sparse import issparse
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import KFold
-# from sklearn.externals.joblib import Parallel, delayed
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
 
@@ -28,9 +27,13 @@ class TrivialRejector:
     def fit(self, X, y):
         self.cats = y.shape[1]
         return self
+
     def decision_function(self, X): return np.zeros((X.shape[0],self.cats))
+
     def predict(self, X): return np.zeros((X.shape[0],self.cats))
+
     def predict_proba(self, X): return np.zeros((X.shape[0],self.cats))
+
     def best_params(self): return {}
 
 
@@ -429,60 +432,6 @@ class PolylingualEmbeddingsClassifier:
         return self.model.best_params()
 
 
-class FunnellingEmbeddingPolylingualClassifier:
-    """ Simulated: this setting is merely for testing purposes, and is not realistic. We here assume to have a tfidf
-    vectorizer for the out-of-scope languages (which is not fair)."""
-    def __init__(self, first_tier_learner, embed_learner, meta_learner, wordembeddings_path, training_languages,
-                 first_tier_parameters = None, embed_parameters = None, meta_parameters = None, n_jobs=-1):
-
-        assert first_tier_learner.probability==True and embed_learner.probability==True, \
-            'both the first-tier classifier and the polyembedding classifier shoud allow calibration'
-
-        self.training_languages = training_languages
-
-        self.PLE = PolylingualEmbeddingsClassifier(wordembeddings_path, embed_learner,
-                                                   c_parameters=embed_parameters, n_jobs=n_jobs)
-
-        self.Funnelling = FunnellingPolylingualClassifier(first_tier_learner, meta_learner,
-                                                          first_tier_parameters=first_tier_parameters,
-                                                          meta_parameters=meta_parameters, n_jobs=n_jobs)
-        self.n_jobs = n_jobs
-
-    def vectorize(self, lX):
-        return {l:self.PLE.lang_tfidf[l].transform(lX[l]) for l in lX.keys()}
-
-    def fit(self, lX, ly):
-        """
-        :param lX: a dictionary {language_label: [list of preprocessed documents]}
-        :param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
-        :return:
-        """
-        self.PLE.fit_vectorizers(lX)
-        tinit = time.time()
-        lX = {l: lX[l] for l in lX.keys() if l in self.training_languages}
-        ly = {l: ly[l] for l in lX.keys() if l in self.training_languages}
-        self.PLE.fit(lX, ly)
-        lZ = self.PLE.predict_proba(lX)
-        self.Funnelling.fit(self.vectorize(lX),ly,lZ,ly)
-        self.time = time.time() - tinit
-        return self
-
-    def predict(self, lX):
-        """
-        :param lX: a dictionary {language_label: [list of preprocessed documents]}
-        """
-        lXin = {l: lX[l] for l in lX.keys() if l in self.training_languages}
-        lXout = {l: lX[l] for l in lX.keys() if l not in self.training_languages}
-
-        lZ = self.PLE.predict_proba(lXout)
-
-        return self.Funnelling.predict(self.vectorize(lXin), lZ)
-
-
-    def best_params(self):
-        return {'PLE':self.PLE.best_params(), 'Funnelling':self.Funnelling.best_params()}
-
-
 class AndreaCLF(FunnellingPolylingualClassifier):
     def __init__(self, we_path,
@@ -509,6 +458,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         self.lang_tfidf = {}
         self.word_embeddings = {}
         self.supervised_embeddings = {}
+        self.model = None
+        self.time = None
 
     def vectorize(self, lX, prediction=False):
         langs = list(lX.keys())
@@ -571,7 +522,7 @@
         if supervised:
             for lang in languages:
                 S = WCE_matrix(lX, ly, lang)
-                S = np.squeeze(np.asarray(S))  # casting to ndarray to better visualize S while debugging
+                # S = np.squeeze(np.asarray(S))  # casting to ndarray to better visualize S while debugging
                 self.supervised_embeddings[lang] = S
                 if unsupervised:
                     _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
@@ -590,33 +541,23 @@
         Z, zy = self._get_zspace(lX, ly)
 
-        # Z vectors is concatenated with doc's embedding weighted sum
-        Z_embedded = dict()
-        l_weighted_em = self.embed(lX, ly,
-                                   unsupervised=self.config['unsupervised'],
-                                   supervised=self.config['supervised'])
-
         if self.config['supervised'] or self.config['unsupervised']:
+            # Z vectors is concatenated with doc's embedding weighted sum
+            Z_embedded = dict()
+            l_weighted_em = self.embed(lX, ly,
+                                       unsupervised=self.config['unsupervised'],
+                                       supervised=self.config['supervised'])
+
+            # stacking Z space horizontally with unsupervised (M) and/or supervised (F) embeddings
             for lang in list(lX.keys()):
                 Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
             Z = Z_embedded
             del Z_embedded
 
-        # stacking Z_embedded space vertically
-        # _vertical_Z = np.vstack([Z_embedded[lang] for lang in self.languages])
-        # _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
+        # stacking Z space vertically
         _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
 
-        # zlangs = list(Z_embedded.keys())  # creo lista con embedding e poi faccio vstack su lista
-        # for i, lang in enumerate(zlangs):
-        #     if i == 0:
-        #         _vertical_Z = Z_embedded[lang]
-        #         _vertical_Zy = zy[lang]
-        #     else:
-        #         _vertical_Z = np.vstack((_vertical_Z, Z_embedded[lang]))
-        #         _vertical_Zy = np.vstack((_vertical_Zy, zy[lang]))
-
         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
                                            n_jobs=self.n_jobs)
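
Not part of the diff above: a minimal sketch of the two behavioural changes it introduces (the third change simply fits on the full lXtr/lytr dictionaries again, dropping the 50-document small_* debugging subsets). The values below are made-up examples standing in for whatever op.we_path and op.dataset hold at run time.

    # Embedding lookup: FastTextWikiNews now builds the vector-file name by plain
    # string concatenation of the caller's cache directory and the per-language
    # file name, which is presumably why the --we-path default gained a trailing slash.
    cache = '/home/andreapdr/CLESA/embeddings/'   # example value for op.we_path
    language = 'it'
    name = cache + 'wiki.multi.{}.vec'.format(language)
    assert name == '/home/andreapdr/CLESA/embeddings/wiki.multi.it.vec'

    # Logfile row: the dataset column now records the basename of the dataset path
    # instead of the hard-coded 'test_datasetname' placeholder.
    dataset = '/path/to/some_dataset.pickle'      # hypothetical value for op.dataset
    dataset_name = dataset.split('/')[-1]
    assert dataset_name == 'some_dataset.pickle'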