diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index 09514de..d7452ba 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -125,9 +125,10 @@ if __name__ == '__main__':
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
 
-    PLE_test = False
+    PLE_test = True
     if PLE_test:
-        ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/moreo/CLESA/PolylingualEmbeddings',
+        ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/',
+                                              config = config,
                                               learner=get_learner(calibrate=False),
                                               c_parameters=get_params(dense=False),
                                               n_jobs=op.n_jobs)
 
@@ -143,7 +144,11 @@ if __name__ == '__main__':
             macrof1, microf1, macrok, microk = ple_eval[lang]
             metrics.append([macrof1, microf1, macrok, microk])
             print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
+            results.add_row('MLE', 'svm', 'no', config['we_type'],
+                            'no','no', op.optimc, op.dataset.split('/')[-1], ple.time,
+                            lang, macrof1, microf1, macrok, microk, '')
         print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
+        exit()
 
     print(f'### PolyEmbedd_andrea_{_config_id}\n')
 
@@ -151,7 +156,7 @@ if __name__ == '__main__':
                                             config=config,
                                             first_tier_learner=get_learner(calibrate=True),
                                             meta_learner=get_learner(calibrate=False, kernel='rbf'),
-                                            first_tier_parameters=None,  # get_params(dense=False),-->first_tier should not be optimized
+                                            first_tier_parameters=None,  # TODO get_params(dense=False),--> first_tier should not be optimized - or not?
                                             meta_parameters=get_params(dense=True),
                                             n_jobs=op.n_jobs)
 
@@ -169,5 +174,5 @@ if __name__ == '__main__':
         results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
                         (config['max_label_space'], classifier.best_components),
                         config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
-                        lang, macrof1, microf1, macrok, microk, '')
+                        lang, macrof1, microf1, macrok, microk, 'min_prevalence = 0')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index 9d20ec3..082a9cf 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -226,6 +226,18 @@ class StorageEmbeddings:
         return
 
     def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
+        only_well_represented_C = False  # TODO testing
+        if only_well_represented_C:
+            labels = labels.copy()
+            min_prevalence = 0
+            print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...')
+            langs = list(docs.keys())
+            well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0) > min_prevalence for lang in langs])
+            # lY = {lY[lang][:, well_repr_cats] for lang in langs}  TODO not clear
+            for lang in langs:
+                labels[lang] = labels[lang][:, well_repr_cats]
+                print(f'Target number reduced to: {labels[lang].shape[1]}\n')
+
         for lang in docs.keys():  # compute supervised matrices S - then apply PCA
             print(f'# [supervised-matrix] for {lang}')
             self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
diff --git a/src/learning/learners.py b/src/learning/learners.py
index 89420bb..5e44bd3 100644
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@@ -461,7 +461,7 @@ class PolylingualEmbeddingsClassifier:
     }
     url: https://github.com/facebookresearch/MUSE
     """
-    def __init__(self, wordembeddings_path, learner, c_parameters=None, n_jobs=-1):
+    def __init__(self, wordembeddings_path, config, learner, c_parameters=None, n_jobs=-1):
         """
         :param wordembeddings_path: the path to the directory containing the polylingual embeddings
         :param learner: the learner
@@ -469,11 +469,15 @@ class PolylingualEmbeddingsClassifier:
         :param n_jobs: the number of concurrent threads
         """
         self.wordembeddings_path = wordembeddings_path
+        self.config = config
         self.learner = learner
         self.c_parameters=c_parameters
         self.n_jobs = n_jobs
         self.lang_tfidf = {}
         self.model = None
+        self.languages = []
+        self.lang_word2idx = dict()
+        self.embedding_space = None
 
     def fit_vectorizers(self, lX):
         for lang in lX.keys():
@@ -483,6 +487,27 @@ class PolylingualEmbeddingsClassifier:
                 tfidf.fit(docs)
                 self.lang_tfidf[lang] = tfidf
 
+
+    def vectorize(self, lX, prediction=False):
+        langs = list(lX.keys())
+        print(f'# tfidf-vectorizing docs')
+        if prediction:
+
+            for lang in langs:
+                assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
+                tfidf_vectorizer = self.lang_tfidf[lang]
+                lX[lang] = tfidf_vectorizer.transform(lX[lang])
+            return self
+
+        for lang in langs:
+            tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
+            self.languages.append(lang)
+            tfidf_vectorizer.fit(lX[lang])
+            lX[lang] = tfidf_vectorizer.transform(lX[lang])
+            self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
+            self.lang_tfidf[lang] = tfidf_vectorizer
+        return self
+
     def embed(self, docs, lang):
         assert lang in self.lang_tfidf, 'unknown language'
         tfidf_vectorizer = self.lang_tfidf[lang]
@@ -515,13 +540,17 @@ class PolylingualEmbeddingsClassifier:
         tinit = time.time()
         langs = list(lX.keys())
         WEtr, Ytr = [], []
-        self.fit_vectorizers(lX)  # if already fit, does nothing
-        for lang in langs:
-            WEtr.append(self.embed(lX[lang], lang))
-            Ytr.append(ly[lang])
+        # self.fit_vectorizers(lX)  # if already fit, does nothing
+        self.vectorize(lX)
+        # config = {'unsupervised' : False, 'supervised': True}
+        self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
+        WEtr = self.embedding_space.predict(self.config, lX)
+        # for lang in langs:
+        #     WEtr.append(self.embed(lX[lang], lang))  # todo embed with other matrices
+        #     Ytr.append(ly[lang])
 
-        WEtr = np.vstack(WEtr)
-        Ytr = np.vstack(Ytr)
+        WEtr = np.vstack([WEtr[lang] for lang in langs])
+        Ytr = np.vstack([ly[lang] for lang in langs])
         self.embed_time = time.time() - tinit
 
         print('fitting the WE-space of shape={}'.format(WEtr.shape))
@@ -535,8 +564,10 @@ class PolylingualEmbeddingsClassifier:
         :param lX: a dictionary {language_label: [list of preprocessed documents]}
         """
         assert self.model is not None, 'predict called before fit'
+        self.vectorize(lX, prediction=True)
         langs = list(lX.keys())
-        lWEte = {lang:self.embed(lX[lang], lang) for lang in langs}  # parallelizing this may consume too much memory
+        lWEte = self.embedding_space.predict(self.config, lX)
+        # lWEte = {lang:self.embed(lX[lang], lang) for lang in langs}  # parallelizing this may consume too much memory
         return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
 
     def predict_proba(self, lX):
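
For reference, a minimal usage sketch of the revised `PolylingualEmbeddingsClassifier` as it stands after this patch: `fit` now calls `vectorize()` to build per-language tf-idf vectorizers and vocabularies, then builds the document-embedding space through `StorageEmbeddings.fit`/`predict` driven by the new `config` argument, replacing the old `fit_vectorizers`/`embed` path. The toy `lX`/`ly` dictionaries, the embeddings path, the `SVC` base learner, and the separate test dictionary are assumptions for illustration; only the `'we_type'`, `'unsupervised'` and `'supervised'` config keys are attested in the diff, and the snippet presumes `src/` is on the Python path.

```python
# Usage sketch only (not part of the patch). Assumed: toy data, the embeddings path,
# the SVC base learner, and any config keys beyond 'we_type'/'unsupervised'/'supervised'.
import numpy as np
from sklearn.svm import SVC

from learning.learners import PolylingualEmbeddingsClassifier  # src/learning/learners.py

# {language: [preprocessed documents]} with aligned binary label matrices per language
lX = {'en': ['first english doc', 'second english doc'],
      'it': ['primo documento', 'secondo documento']}
ly = {'en': np.array([[1, 0], [0, 1]]),
      'it': np.array([[0, 1], [1, 0]])}
# separate raw test docs: vectorize() transforms the dictionaries it receives in place
lX_test = {'en': ['another english doc'],
           'it': ['un altro documento']}

# assumed subset of config keys; FPEC_andrea.py builds the real config elsewhere
config = {'we_type': 'MUSE', 'unsupervised': True, 'supervised': False}

ple = PolylingualEmbeddingsClassifier(
    wordembeddings_path='/path/to/polylingual/embeddings/',  # placeholder path
    config=config,
    learner=SVC(kernel='linear'),  # stand-in for get_learner(calibrate=False)
    c_parameters=None,
    n_jobs=1)

# vectorize() fits tf-idf per language, StorageEmbeddings builds the embedding space,
# then the learner is trained on the row-stacked WEtr/Ytr matrices
ple.fit(lX, ly)

# predict(): re-vectorizes with the fitted tf-idf and projects into the same space
ly_pred = ple.predict(lX_test)
```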