fixed embeddings remote import and dataset name in logfile
parent df957e8448 · commit 601da33836
@@ -1,4 +1,3 @@
-from sklearn.svm import SVC
 import os, sys
 from dataset_builder import MultilingualDataset
 from learning.learners import *
@@ -6,6 +5,7 @@ from util.evaluation import *
 from optparse import OptionParser
 from util.file import exists
 from util.results import PolylingualClassificationResults
+from sklearn.svm import SVC


 parser = OptionParser()
@@ -20,7 +20,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed",
                   help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')

 parser.add_option("-w", "--we-path", dest="we_path",
-                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings')
+                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings/')

 parser.add_option("-s", "--set_c", dest="set_c", type=float,
                   help="Set the C parameter", default=1)
@@ -63,41 +63,6 @@ if __name__ == '__main__':
     lXtr, lytr = data.training()
     lXte, lyte = data.test()

-    print(lXtr.keys())
-
-    small_lXtr = dict()
-    small_lytr = dict()
-    small_lXte = dict()
-    small_lyte = dict()
-
-    small_lXtr['da'] = lXtr['da'][:50]
-    small_lytr['da'] = lytr['da'][:50]
-    # small_lXtr['en'] = lXtr['en'][:50]
-    # small_lytr['en'] = lytr['en'][:50]
-    # small_lXtr['fr'] = lXtr['fr'][:50]
-    # small_lytr['fr'] = lytr['fr'][:50]
-    # small_lXte['da'] = lXte['da'][:50]
-    # small_lyte['da'] = lyte['da'][:50]
-    # small_lXte['en'] = lXte['en'][:50]
-    # small_lyte['en'] = lyte['en'][:50]
-    # small_lXte['fr'] = lXte['fr'][:50]
-    # small_lyte['fr'] = lyte['fr'][:50]
-    # small_lXtr['it'] = lXtr['it'][:50]
-    # small_lytr['it'] = lytr['it'][:50]
-    # small_lXtr['es'] = lXtr['es'][:50]
-    # small_lytr['es'] = lytr['es'][:50]
-    # small_lXtr['de'] = lXtr['de'][:50]
-    # small_lytr['de'] = lytr['de'][:50]
-    # small_lXtr['pt'] = lXtr['pt'][:50]
-    # small_lytr['pt'] = lytr['pt'][:50]
-    # small_lXtr['nl'] = lXtr['de'][:50]
-    # small_lytr['nl'] = lytr['de'][:50]
-    # small_lXtr['fi'] = lXtr['fi'][:50]
-    # small_lytr['fi'] = lytr['fi'][:50]
-    # small_lXtr['hu'] = lXtr['hu'][:50]
-    # small_lytr['hu'] = lytr['hu'][:50]
-    # small_lXtr['sv'] = lXtr['sv'][:50]
-    # small_lytr['sv'] = lytr['sv'][:50]
-
     if op.set_c != -1:
         meta_parameters = None
@@ -137,7 +102,7 @@ if __name__ == '__main__':
                              n_jobs=op.n_jobs)

     print('# Fitting ...')
-    classifier.fit(small_lXtr, small_lytr)
+    classifier.fit(lXtr, lytr)

     print('# Evaluating ...')
     l_eval = evaluate_method(classifier, lXte, lyte)
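Note: with the hard-coded small_* debug dictionaries removed in the hunk above, the classifier is now fitted on the full training dictionaries. If a quick smoke test is still wanted, an opt-in slice keeps the main path clean; the sketch below is only an illustration, and the --debug-subsample option name and the per-language cap are assumptions, not part of this commit.

# Hypothetical sketch (not in this commit): optional per-language subsampling
# instead of hand-built small_lXtr / small_lytr dictionaries.
parser.add_option("--debug-subsample", dest="debug_subsample", type=int, default=0,
                  help="If > 0, keep only this many training documents per language")

if op.debug_subsample > 0:
    lXtr = {lang: docs[:op.debug_subsample] for lang, docs in lXtr.items()}
    lytr = {lang: labels[:op.debug_subsample] for lang, labels in lytr.items()}

classifier.fit(lXtr, lytr)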
@@ -147,5 +112,6 @@ if __name__ == '__main__':
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, 'test_datasetname', 'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
+        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1],
+                        'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
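Note: this is the "dataset name in logfile" part of the commit message. The placeholder string 'test_datasetname' is replaced by the last path component of op.dataset, so the results log records which dataset the run actually used. A minimal sketch of the extraction (the example path is made up):

dataset_path = '/path/to/datasets/rcv1-2_multilingual.pickle'   # made-up example for op.dataset
dataset_name = dataset_path.split('/')[-1]                      # 'rcv1-2_multilingual.pickle'
# os.path.basename(dataset_path) is an equivalent, slightly more portable alternative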
@@ -147,10 +147,13 @@ class FastTextWikiNews(Vectors):

     url_base = 'Cant auto-download MUSE embeddings'
     path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
+    _name = 'wiki.multi.{}.vec'

     def __init__(self, cache, language="en", **kwargs):
         url = self.url_base.format(language)
-        name = self.path.format(language)
+        # name = self.path.format(language)
+        name = cache + self._name.format(language)
+        # print(f'\n\nFASTEXTWIKI-NEW CLASS:\nurl = {url}\nname = {name}\ncache {cache}\nlanguage = {language}')
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
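Note: this is the "embeddings remote import" part of the fix. FastTextWikiNews no longer points at the hard-coded /storage/... path; it builds the vector-file name from the cache directory handed to the class plus the wiki.multi.{}.vec pattern, which is also why the --we-path default above gained a trailing slash (the name is a plain string concatenation). A small sketch of the construction, assuming the cache directory already holds the MUSE .vec files:

cache = '/home/andreapdr/CLESA/embeddings/'        # --we-path value; trailing slash is required by the concatenation
language = 'da'
_name = 'wiki.multi.{}.vec'

name = cache + _name.format(language)              # '/home/andreapdr/CLESA/embeddings/wiki.multi.da.vec'
# os.path.join(cache, _name.format(language)) would also tolerate a missing trailing slash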
@@ -1,11 +1,10 @@
 import numpy as np
 import time
 from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix
-from scipy.sparse import issparse, csr_matrix
+from scipy.sparse import issparse
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import KFold
-# from sklearn.externals.joblib import Parallel, delayed
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
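Note: dropping the commented-out sklearn.externals.joblib import matches current scikit-learn, where that compatibility shim was deprecated and later removed; importing Parallel and delayed straight from joblib is the supported form. A toy sketch of the pattern (the _fit_one helper is hypothetical, but the per-language parallelism mirrors how the learners in this module use Parallel/delayed):

from joblib import Parallel, delayed

def _fit_one(lang, docs):          # hypothetical helper, for illustration only
    return lang, len(docs)

out = Parallel(n_jobs=2)(delayed(_fit_one)(lang, docs)
                         for lang, docs in {'en': ['d1', 'd2'], 'da': ['d3']}.items())
print(dict(out))                   # {'en': 2, 'da': 1}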
@@ -28,9 +27,13 @@ class TrivialRejector:
     def fit(self, X, y):
         self.cats = y.shape[1]
         return self
+
     def decision_function(self, X): return np.zeros((X.shape[0],self.cats))
+
     def predict(self, X): return np.zeros((X.shape[0],self.cats))
+
     def predict_proba(self, X): return np.zeros((X.shape[0],self.cats))
+
     def best_params(self): return {}
@@ -429,60 +432,6 @@ class PolylingualEmbeddingsClassifier:
         return self.model.best_params()


-class FunnellingEmbeddingPolylingualClassifier:
-    """Simulated: this setting is merely for testing purposes, and is not realistic. We here assume to have a tfidf
-    vectorizer for the out-of-scope languages (which is not fair)."""
-    def __init__(self, first_tier_learner, embed_learner, meta_learner, wordembeddings_path, training_languages,
-                 first_tier_parameters=None, embed_parameters=None, meta_parameters=None, n_jobs=-1):
-
-        assert first_tier_learner.probability==True and embed_learner.probability==True, \
-            'both the first-tier classifier and the polyembedding classifier shoud allow calibration'
-
-        self.training_languages = training_languages
-
-        self.PLE = PolylingualEmbeddingsClassifier(wordembeddings_path, embed_learner,
-                                                   c_parameters=embed_parameters, n_jobs=n_jobs)
-
-        self.Funnelling = FunnellingPolylingualClassifier(first_tier_learner, meta_learner,
-                                                          first_tier_parameters=first_tier_parameters,
-                                                          meta_parameters=meta_parameters, n_jobs=n_jobs)
-        self.n_jobs = n_jobs
-
-    def vectorize(self, lX):
-        return {l: self.PLE.lang_tfidf[l].transform(lX[l]) for l in lX.keys()}
-
-    def fit(self, lX, ly):
-        """
-        :param lX: a dictionary {language_label: [list of preprocessed documents]}
-        :param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
-        :return:
-        """
-        self.PLE.fit_vectorizers(lX)
-        tinit = time.time()
-        lX = {l: lX[l] for l in lX.keys() if l in self.training_languages}
-        ly = {l: ly[l] for l in lX.keys() if l in self.training_languages}
-        self.PLE.fit(lX, ly)
-        lZ = self.PLE.predict_proba(lX)
-        self.Funnelling.fit(self.vectorize(lX), ly, lZ, ly)
-        self.time = time.time() - tinit
-        return self
-
-    def predict(self, lX):
-        """
-        :param lX: a dictionary {language_label: [list of preprocessed documents]}
-        """
-        lXin = {l: lX[l] for l in lX.keys() if l in self.training_languages}
-        lXout = {l: lX[l] for l in lX.keys() if l not in self.training_languages}
-
-        lZ = self.PLE.predict_proba(lXout)
-
-        return self.Funnelling.predict(self.vectorize(lXin), lZ)
-
-
-    def best_params(self):
-        return {'PLE': self.PLE.best_params(), 'Funnelling': self.Funnelling.best_params()}
-
-
 class AndreaCLF(FunnellingPolylingualClassifier):
     def __init__(self,
                  we_path,
@@ -509,6 +458,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         self.lang_tfidf = {}
         self.word_embeddings = {}
         self.supervised_embeddings = {}
+        self.model = None
+        self.time = None

     def vectorize(self, lX, prediction=False):
         langs = list(lX.keys())
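Note: initialising self.model and self.time to None in __init__ is a small robustness touch; both attributes were presumably only assigned inside fit, so inspecting them (or the fit time) before training raised AttributeError. A stripped-down, hypothetical illustration of the guard this enables:

class _Sketch:                     # not the real AndreaCLF, illustration only
    def __init__(self):
        self.model = None
        self.time = None

    def best_params(self):
        if self.model is None:
            raise ValueError('call fit() before querying best_params()')
        return self.model.best_params()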
@@ -571,7 +522,7 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         if supervised:
             for lang in languages:
                 S = WCE_matrix(lX, ly, lang)
-                S = np.squeeze(np.asarray(S))  # casting to ndarray to better visualize S while debugging
+                # S = np.squeeze(np.asarray(S))  # casting to ndarray to better visualize S while debugging
                 self.supervised_embeddings[lang] = S
                 if unsupervised:
                     _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
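Note: commenting out the np.squeeze(np.asarray(S)) cast leaves the word-class embedding matrix S in whatever form WCE_matrix returns; by its own comment the cast was only a debugging aid, and densifying a vocabulary-by-category matrix per language costs memory without changing the weighted sum lX[lang].dot(S). A tiny shape sketch with made-up sizes:

import numpy as np
from scipy.sparse import csr_matrix

n_docs, n_terms, n_cats = 4, 6, 3                   # made-up sizes
X = csr_matrix(np.random.rand(n_docs, n_terms))     # tf-idf document-term matrix for one language
S = np.random.rand(n_terms, n_cats)                 # supervised (word-class) embedding matrix

weighted = X.dot(S)                                 # (n_docs, n_cats) weighted sum of WCEs per document
print(weighted.shape)                               # (4, 3)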
@@ -590,33 +541,23 @@ class AndreaCLF(FunnellingPolylingualClassifier):

         Z, zy = self._get_zspace(lX, ly)

+        if self.config['supervised'] or self.config['unsupervised']:
             # Z vectors is concatenated with doc's embedding weighted sum
             Z_embedded = dict()
             l_weighted_em = self.embed(lX, ly,
                                        unsupervised=self.config['unsupervised'],
                                        supervised=self.config['supervised'])

-        if self.config['supervised'] or self.config['unsupervised']:
+            # stacking Z space horizontally with unsupervised (M) and/or supervised (F) embeddings
             for lang in list(lX.keys()):
                 Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
             Z = Z_embedded
             del Z_embedded

-        # stacking Z_embedded space vertically
-        # _vertical_Z = np.vstack([Z_embedded[lang] for lang in self.languages])
-        # _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
+        # stacking Z space vertically
         _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])

-        # zlangs = list(Z_embedded.keys())  # build a list with the embeddings and then vstack the list
-        # for i, lang in enumerate(zlangs):
-        #     if i == 0:
-        #         _vertical_Z = Z_embedded[lang]
-        #         _vertical_Zy = zy[lang]
-        #     else:
-        #         _vertical_Z = np.vstack((_vertical_Z, Z_embedded[lang]))
-        #         _vertical_Zy = np.vstack((_vertical_Zy, zy[lang]))
-
         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
                                            n_jobs=self.n_jobs)
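Note: the restructured fit skips the whole embedding branch when neither the supervised nor the unsupervised option is active, rather than always computing l_weighted_em and gating only the horizontal stack; the dead zlangs accumulation loop goes away in favour of the vstack one-liners. A minimal sketch of the two stacking steps with made-up shapes, reading Z[lang] as the first-tier posterior probabilities and E[lang] as the per-document embedding features:

import numpy as np

langs = ['en', 'da']
n_docs, n_cats, emb_dim = 5, 3, 4                        # made-up sizes

Z = {l: np.random.rand(n_docs, n_cats) for l in langs}   # first-tier posteriors per language
E = {l: np.random.rand(n_docs, emb_dim) for l in langs}  # weighted (word / WCE) embeddings per document

# horizontal stack: widen each language's Z space with its embedding features
Z = {l: np.hstack((Z[l], E[l])) for l in langs}

# vertical stack: a single training matrix for the meta-classifier across languages
vertical_Z = np.vstack([Z[l] for l in langs])
print(vertical_Z.shape)                                  # (10, 7)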