fixed embeddings remote import and dataset name in logfile
This commit is contained in:
parent df957e8448
commit 601da33836
@@ -1,4 +1,3 @@
-from sklearn.svm import SVC
 import os, sys
 from dataset_builder import MultilingualDataset
 from learning.learners import *
@@ -6,6 +5,7 @@ from util.evaluation import *
 from optparse import OptionParser
 from util.file import exists
 from util.results import PolylingualClassificationResults
+from sklearn.svm import SVC
 
 
 parser = OptionParser()
@@ -20,7 +20,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed",
                   help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
 
 parser.add_option("-w", "--we-path", dest="we_path",
-                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings')
+                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings/')
 
 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)
@@ -63,41 +63,6 @@ if __name__ == '__main__':
     lXtr, lytr = data.training()
     lXte, lyte = data.test()
 
-    print(lXtr.keys())
-
-    small_lXtr = dict()
-    small_lytr = dict()
-    small_lXte = dict()
-    small_lyte = dict()
-
-    small_lXtr['da'] = lXtr['da'][:50]
-    small_lytr['da'] = lytr['da'][:50]
-    # small_lXtr['en'] = lXtr['en'][:50]
-    # small_lytr['en'] = lytr['en'][:50]
-    # small_lXtr['fr'] = lXtr['fr'][:50]
-    # small_lytr['fr'] = lytr['fr'][:50]
-    # small_lXte['da'] = lXte['da'][:50]
-    # small_lyte['da'] = lyte['da'][:50]
-    # small_lXte['en'] = lXte['en'][:50]
-    # small_lyte['en'] = lyte['en'][:50]
-    # small_lXte['fr'] = lXte['fr'][:50]
-    # small_lyte['fr'] = lyte['fr'][:50]
-    # small_lXtr['it'] = lXtr['it'][:50]
-    # small_lytr['it'] = lytr['it'][:50]
-    # small_lXtr['es'] = lXtr['es'][:50]
-    # small_lytr['es'] = lytr['es'][:50]
-    # small_lXtr['de'] = lXtr['de'][:50]
-    # small_lytr['de'] = lytr['de'][:50]
-    # small_lXtr['pt'] = lXtr['pt'][:50]
-    # small_lytr['pt'] = lytr['pt'][:50]
-    # small_lXtr['nl'] = lXtr['de'][:50]
-    # small_lytr['nl'] = lytr['de'][:50]
-    # small_lXtr['fi'] = lXtr['fi'][:50]
-    # small_lytr['fi'] = lytr['fi'][:50]
-    # small_lXtr['hu'] = lXtr['hu'][:50]
-    # small_lytr['hu'] = lytr['hu'][:50]
-    # small_lXtr['sv'] = lXtr['sv'][:50]
-    # small_lytr['sv'] = lytr['sv'][:50]
 
     if op.set_c != -1:
         meta_parameters = None
@@ -137,7 +102,7 @@ if __name__ == '__main__':
                              n_jobs=op.n_jobs)
 
     print('# Fitting ...')
-    classifier.fit(small_lXtr, small_lytr)
+    classifier.fit(lXtr, lytr)
 
     print('# Evaluating ...')
     l_eval = evaluate_method(classifier, lXte, lyte)
@@ -147,5 +112,6 @@ if __name__ == '__main__':
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, 'test_datasetname', 'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
+        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1],
+                        'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
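Note on the add_row change above: the dataset column in the results log now comes from the --dataset option instead of the hard-coded 'test_datasetname' placeholder. A minimal sketch of what op.dataset.split('/')[-1] computes (the helper name and paths are illustrative, not part of the codebase):

import os

def dataset_name(path):
    # last path component, i.e. the dataset file name
    return path.split('/')[-1]

print(dataset_name('/home/user/datasets/some_dataset.pickle'))  # 'some_dataset.pickle'
# Caveat: a trailing slash yields an empty name; os.path.basename behaves the same,
# so the value passed via --dataset should name the file itself.
print(repr(dataset_name('/home/user/datasets/')))      # ''
print(repr(os.path.basename('/home/user/datasets/')))  # ''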
@@ -147,10 +147,13 @@ class FastTextWikiNews(Vectors):
 
     url_base = 'Cant auto-download MUSE embeddings'
     path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
+    _name = 'wiki.multi.{}.vec'
 
     def __init__(self, cache, language="en", **kwargs):
         url = self.url_base.format(language)
-        name = self.path.format(language)
+        # name = self.path.format(language)
+        name = cache + self._name.format(language)
+        # print(f'\n\nFASTEXTWIKI-NEW CLASS:\nurl = {url}\nname = {name}\ncache {cache}\nlanguage = {language}')
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
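The constructor now resolves the MUSE vector file inside the caller-supplied cache directory instead of the hard-coded /storage/... path, so the embeddings load on any machine that has them locally (they cannot be auto-downloaded, as the url_base placeholder says). One detail worth flagging: cache + self._name.format(language) is plain string concatenation, so cache must end with a separator, which is presumably why the --we-path default above gained its trailing '/'. A small sketch of the pitfall, with os.path.join as a more forgiving alternative (an assumption on my part, not what the commit uses):

import os

cache = '/home/andreapdr/CLESA/embeddings'             # no trailing slash
print(cache + 'wiki.multi.en.vec')                     # ...embeddingswiki.multi.en.vec (wrong file)
print(os.path.join(cache, 'wiki.multi.en.vec'))        # ...embeddings/wiki.multi.en.vec
print(os.path.join(cache + '/', 'wiki.multi.en.vec'))  # join also tolerates a trailing slash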
@@ -1,11 +1,10 @@
 import numpy as np
 import time
 from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix
-from scipy.sparse import issparse, csr_matrix
+from scipy.sparse import issparse
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import KFold
-# from sklearn.externals.joblib import Parallel, delayed
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
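Importing Parallel and delayed from the standalone joblib package is the right move here: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23. If older environments ever need to be supported, a guarded import along these lines would keep both working (a sketch, not part of this commit):

try:
    from joblib import Parallel, delayed  # standalone package: pip install joblib
except ImportError:
    # scikit-learn < 0.23 still ships a vendored copy
    from sklearn.externals.joblib import Parallel, delayed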
@@ -28,9 +27,13 @@ class TrivialRejector:
     def fit(self, X, y):
         self.cats = y.shape[1]
         return self
+
     def decision_function(self, X): return np.zeros((X.shape[0],self.cats))
+
     def predict(self, X): return np.zeros((X.shape[0],self.cats))
+
     def predict_proba(self, X): return np.zeros((X.shape[0],self.cats))
+
     def best_params(self): return {}
@@ -429,60 +432,6 @@ class PolylingualEmbeddingsClassifier:
         return self.model.best_params()
 
 
-class FunnellingEmbeddingPolylingualClassifier:
-    """ Simulated: this setting is merely for testing purposes, and is not realistic. We here assume to have a tfidf
-    vectorizer for the out-of-scope languages (which is not fair)."""
-    def __init__(self, first_tier_learner, embed_learner, meta_learner, wordembeddings_path, training_languages,
-                 first_tier_parameters = None, embed_parameters = None, meta_parameters = None, n_jobs=-1):
-
-        assert first_tier_learner.probability==True and embed_learner.probability==True, \
-            'both the first-tier classifier and the polyembedding classifier shoud allow calibration'
-
-        self.training_languages = training_languages
-
-        self.PLE = PolylingualEmbeddingsClassifier(wordembeddings_path, embed_learner,
-                                                   c_parameters=embed_parameters, n_jobs=n_jobs)
-
-        self.Funnelling = FunnellingPolylingualClassifier(first_tier_learner, meta_learner,
-                                                          first_tier_parameters=first_tier_parameters,
-                                                          meta_parameters=meta_parameters, n_jobs=n_jobs)
-        self.n_jobs = n_jobs
-
-    def vectorize(self, lX):
-        return {l:self.PLE.lang_tfidf[l].transform(lX[l]) for l in lX.keys()}
-
-    def fit(self, lX, ly):
-        """
-        :param lX: a dictionary {language_label: [list of preprocessed documents]}
-        :param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
-        :return:
-        """
-        self.PLE.fit_vectorizers(lX)
-        tinit = time.time()
-        lX = {l: lX[l] for l in lX.keys() if l in self.training_languages}
-        ly = {l: ly[l] for l in lX.keys() if l in self.training_languages}
-        self.PLE.fit(lX, ly)
-        lZ = self.PLE.predict_proba(lX)
-        self.Funnelling.fit(self.vectorize(lX),ly,lZ,ly)
-        self.time = time.time() - tinit
-        return self
-
-    def predict(self, lX):
-        """
-        :param lX: a dictionary {language_label: [list of preprocessed documents]}
-        """
-        lXin = {l: lX[l] for l in lX.keys() if l in self.training_languages}
-        lXout = {l: lX[l] for l in lX.keys() if l not in self.training_languages}
-
-        lZ = self.PLE.predict_proba(lXout)
-
-        return self.Funnelling.predict(self.vectorize(lXin), lZ)
-
-
-    def best_params(self):
-        return {'PLE':self.PLE.best_params(), 'Funnelling':self.Funnelling.best_params()}
-
-
 class AndreaCLF(FunnellingPolylingualClassifier):
     def __init__(self,
                  we_path,
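For the record, the deleted class wired a PolylingualEmbeddingsClassifier (first tier) into a FunnellingPolylingualClassifier (meta tier): the first tier emits calibrated class posteriors (hence the assert on probability=True), and the meta classifier is trained on those posteriors rather than on raw features. A self-contained toy sketch of that funnelling idea, single-label for brevity (the project itself handles multilabel via OneVsRestClassifier; all names and data below are illustrative):

import numpy as np
from sklearn.svm import SVC

rng = np.random.default_rng(0)
# stand-in for per-language feature matrices and targets
data = {lang: (rng.normal(size=(40, 5)), rng.integers(0, 3, size=40))
        for lang in ('en', 'da', 'it')}

# first tier: one calibrated classifier per language
tier1 = {lang: SVC(probability=True).fit(X, y) for lang, (X, y) in data.items()}

# funnel every language into the shared space of class posteriors,
# then fit a single meta classifier on that language-independent space
Z = np.vstack([tier1[lang].predict_proba(X) for lang, (X, _) in data.items()])
zy = np.concatenate([y for _, y in data.values()])
meta = SVC().fit(Z, zy)
print(meta.predict(tier1['en'].predict_proba(data['en'][0]))[:5])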
@@ -509,6 +458,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         self.lang_tfidf = {}
         self.word_embeddings = {}
+        self.supervised_embeddings = {}
+        self.model = None
         self.time = None
 
     def vectorize(self, lX, prediction=False):
         langs = list(lX.keys())
@@ -571,7 +522,7 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         if supervised:
             for lang in languages:
                 S = WCE_matrix(lX, ly, lang)
-                S = np.squeeze(np.asarray(S)) # casting to ndarray to better visualize S while debugging
+                # S = np.squeeze(np.asarray(S)) # casting to ndarray to better visualize S while debugging
                 self.supervised_embeddings[lang] = S
                 if unsupervised:
                     _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
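Leaving the np.squeeze(np.asarray(S)) cast commented out keeps S in whatever form WCE_matrix returns; per its own comment, the cast existed only to inspect S while debugging. The line below it only needs lX[lang].dot(S), which works with a sparse document matrix and a dense embedding matrix alike. A minimal shape sketch (dimensions are illustrative):

import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.eye(4, 10))  # 4 documents over a 10-term vocabulary (tf-idf stand-in)
S = np.ones((10, 3))           # word-class embedding: 10 terms x 3 categories
doc_repr = X.dot(S)            # (4, 3): per-document weighted sum of term embeddings
print(type(doc_repr).__name__, doc_repr.shape)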
@@ -590,33 +541,23 @@ class AndreaCLF(FunnellingPolylingualClassifier):
 
         Z, zy = self._get_zspace(lX, ly)
 
-        # Z vectors is concatenated with doc's embedding weighted sum
-        Z_embedded = dict()
-        l_weighted_em = self.embed(lX, ly,
-                                   unsupervised=self.config['unsupervised'],
-                                   supervised=self.config['supervised'])
+        if self.config['supervised'] or self.config['unsupervised']:
+            # Z vectors is concatenated with doc's embedding weighted sum
+            Z_embedded = dict()
+            l_weighted_em = self.embed(lX, ly,
+                                       unsupervised=self.config['unsupervised'],
+                                       supervised=self.config['supervised'])
+
+            # stacking Z space horizontally with unsupervised (M) and/or supervised (F) embeddings
+            for lang in list(lX.keys()):
+                Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
+            Z = Z_embedded
+            del Z_embedded
 
-        # stacking Z_embedded space vertically
-        # _vertical_Z = np.vstack([Z_embedded[lang] for lang in self.languages])
-        # _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
+        # stacking Z space vertically
+        _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
+        _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
 
-        # zlangs = list(Z_embedded.keys())  # build a list with the embeddings, then vstack the list
-        # for i, lang in enumerate(zlangs):
-        #     if i == 0:
-        #         _vertical_Z = Z_embedded[lang]
-        #         _vertical_Zy = zy[lang]
-        #     else:
-        #         _vertical_Z = np.vstack((_vertical_Z, Z_embedded[lang]))
-        #         _vertical_Zy = np.vstack((_vertical_Zy, zy[lang]))
 
         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
                                            n_jobs=self.n_jobs)
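Net effect of the rewritten block: the horizontal hstack (now guarded by the config flags, so a plain funnelling run skips it) widens each language's posterior matrix Z[lang] with its embedding features, and the vertical vstack then concatenates all languages into the single matrix the meta classifier is fitted on. In shapes, with illustrative numbers:

import numpy as np

n_docs, n_cats, emb_dim = 100, 73, 300
Z = {lang: np.zeros((n_docs, n_cats)) for lang in ('en', 'da')}     # first-tier posteriors
emb = {lang: np.zeros((n_docs, emb_dim)) for lang in ('en', 'da')}  # weighted doc embeddings

# horizontal: each language gains embedding columns -> (100, 73 + 300)
Z = {lang: np.hstack((Z[lang], emb[lang])) for lang in Z}
# vertical: all languages share one Z-space -> (200, 373)
vertical_Z = np.vstack([Z[lang] for lang in ('en', 'da')])
print(vertical_Z.shape)  # (200, 373)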