Implemented a method to compute WCE only for well-represented classes;
refactored the MLE class to support WCE, standard embeddings, and their combination.
parent 56ee88220b
commit 0e66fbf197
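In short, the refactoring routes everything through a config dict: PolylingualEmbeddingsClassifier now receives it in the constructor, and fit() uses it to decide whether to build the supervised (WCE) space, the standard pretrained space, or a combination. A minimal, hypothetical usage sketch follows; it assumes the repository module exposing PolylingualEmbeddingsClassifier and get_learner is importable, the paths and the lXtr/lytr/lXte variables are placeholders, and the config keys are taken from the commented hint in the fit() hunk further down.

# Hypothetical usage sketch -- PolylingualEmbeddingsClassifier and get_learner come from this
# repository (import path not shown in the diff); paths and data dicts are placeholders.
config = {'unsupervised': False, 'supervised': True}   # supervised -> WCE, unsupervised -> pretrained embeddings

ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/path/to/polylingual/embeddings',  # placeholder
                                      config=config,
                                      learner=get_learner(calibrate=False),
                                      c_parameters=None,    # no grid search in this sketch
                                      n_jobs=-1)
ple.fit(lXtr, lytr)             # lXtr: {lang: [raw docs]},  lytr: {lang: binary label matrix}
lY_pred = ple.predict(lXte)     # lXte: {lang: [raw docs]} -> {lang: predicted label matrix}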
@@ -125,9 +125,10 @@ if __name__ == '__main__':
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
 
-    PLE_test = False
+    PLE_test = True
     if PLE_test:
-        ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/moreo/CLESA/PolylingualEmbeddings',
+        ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/',
+                                              config = config,
                                               learner=get_learner(calibrate=False),
                                               c_parameters=get_params(dense=False),
                                               n_jobs=op.n_jobs)
@@ -143,7 +144,11 @@ if __name__ == '__main__':
             macrof1, microf1, macrok, microk = ple_eval[lang]
             metrics.append([macrof1, microf1, macrok, microk])
             print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
+            results.add_row('MLE', 'svm', 'no', config['we_type'],
+                            'no','no', op.optimc, op.dataset.split('/')[-1], ple.time,
+                            lang, macrof1, microf1, macrok, microk, '')
         print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
+        exit()
 
 
     print(f'### PolyEmbedd_andrea_{_config_id}\n')
@@ -151,7 +156,7 @@ if __name__ == '__main__':
                                            config=config,
                                            first_tier_learner=get_learner(calibrate=True),
                                            meta_learner=get_learner(calibrate=False, kernel='rbf'),
-                                           first_tier_parameters=None, # get_params(dense=False),-->first_tier should not be optimized
+                                           first_tier_parameters=None, # TODO get_params(dense=False),--> first_tier should not be optimized - or not?
                                            meta_parameters=get_params(dense=True),
                                            n_jobs=op.n_jobs)
@@ -169,5 +174,5 @@ if __name__ == '__main__':
         results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
                         (config['max_label_space'], classifier.best_components),
                         config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
-                        lang, macrof1, microf1, macrok, microk, '')
+                        lang, macrof1, microf1, macrok, microk, 'min_prevalence = 0')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
@@ -226,6 +226,18 @@ class StorageEmbeddings:
         return
 
     def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
+        only_well_represented_C = False # TODO testing
+        if only_well_represented_C:
+            labels = labels.copy()
+            min_prevalence = 0
+            print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...')
+            langs = list(docs.keys())
+            well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs])
+            # lY = {lY[lang][:, well_repr_cats] for lang in langs} TODO not clear
+            for lang in langs:
+                labels[lang] = labels[lang][:, well_repr_cats]
+                print(f'Target number reduced to: {labels[lang].shape[1]}\n')
+
         for lang in docs.keys():  # compute supervised matrices S - then apply PCA
             print(f'# [supervised-matrix] for {lang}')
             self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
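The block added above is the heart of this commit: before the supervised (WCE) matrix is computed, label columns whose prevalence does not exceed min_prevalence in every language are masked out, so word-class correlations are estimated only for well-represented classes. A self-contained sketch of that masking step (numpy only; the function name and toy data are ours):

import numpy as np

def filter_well_represented(labels, min_prevalence=0):
    """Keep only the label columns whose positive count exceeds min_prevalence in every language.
    labels: {lang: (n_docs, n_classes) binary ndarray} -- a toy stand-in for the labels dict above."""
    langs = list(labels.keys())
    # a class survives only if it is well represented in ALL languages
    well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0) > min_prevalence for lang in langs])
    return {lang: labels[lang][:, well_repr_cats] for lang in langs}, well_repr_cats

# toy example: the third class has no positive documents in 'it', so it is dropped everywhere
labels = {'en': np.array([[1, 0, 1], [0, 1, 0]]),
          'it': np.array([[1, 1, 0], [0, 1, 0]])}
filtered, mask = filter_well_represented(labels, min_prevalence=0)
print(mask)                      # [ True  True False]
print(filtered['en'].shape[1])   # 2 target classes left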
@@ -461,7 +461,7 @@ class PolylingualEmbeddingsClassifier:
     }
     url: https://github.com/facebookresearch/MUSE
     """
-    def __init__(self, wordembeddings_path, learner, c_parameters=None, n_jobs=-1):
+    def __init__(self, wordembeddings_path, config, learner, c_parameters=None, n_jobs=-1):
         """
         :param wordembeddings_path: the path to the directory containing the polylingual embeddings
         :param learner: the learner
@@ -469,11 +469,15 @@ class PolylingualEmbeddingsClassifier:
         :param n_jobs: the number of concurrent threads
         """
         self.wordembeddings_path = wordembeddings_path
+        self.config = config
         self.learner = learner
         self.c_parameters=c_parameters
         self.n_jobs = n_jobs
         self.lang_tfidf = {}
         self.model = None
+        self.languages = []
+        self.lang_word2idx = dict()
+        self.embedding_space = None
 
     def fit_vectorizers(self, lX):
         for lang in lX.keys():
@@ -483,6 +487,27 @@ class PolylingualEmbeddingsClassifier:
             tfidf.fit(docs)
             self.lang_tfidf[lang] = tfidf
 
+
+    def vectorize(self, lX, prediction=False):
+        langs = list(lX.keys())
+        print(f'# tfidf-vectorizing docs')
+        if prediction:
+            for lang in langs:
+                assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
+                tfidf_vectorizer = self.lang_tfidf[lang]
+                lX[lang] = tfidf_vectorizer.transform(lX[lang])
+            return self
+
+        for lang in langs:
+            tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
+            self.languages.append(lang)
+            tfidf_vectorizer.fit(lX[lang])
+            lX[lang] = tfidf_vectorizer.transform(lX[lang])
+            self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
+            self.lang_tfidf[lang] = tfidf_vectorizer
+        return self
+
     def embed(self, docs, lang):
         assert lang in self.lang_tfidf, 'unknown language'
         tfidf_vectorizer = self.lang_tfidf[lang]
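The new vectorize() method above fits one TfidfVectorizer per language at training time, storing each fitted vectorizer in lang_tfidf and its vocabulary in lang_word2idx, and simply reuses the stored vectorizers when prediction=True. The underlying per-language fit/transform pattern is plain scikit-learn; a self-contained toy illustration (all names except TfidfVectorizer are ours):

from sklearn.feature_extraction.text import TfidfVectorizer

lX_train = {'en': ['the cat sat', 'the dog barked'],
            'it': ['il gatto dorme', 'il cane abbaia']}
lX_test  = {'en': ['the cat barked'], 'it': ['il cane dorme']}

lang_tfidf, lang_word2idx = {}, {}
for lang, docs in lX_train.items():                 # training: fit one vectorizer per language
    vec = TfidfVectorizer(sublinear_tf=True, use_idf=True)
    lang_tfidf[lang] = vec.fit(docs)
    lang_word2idx[lang] = vec.vocabulary_           # word -> column index, as vectorize() stores it

lX_test_vec = {lang: lang_tfidf[lang].transform(docs)   # prediction: reuse the fitted vectorizers
               for lang, docs in lX_test.items()}
print(lX_test_vec['en'].shape)   # (1, size of the English vocabulary)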
@@ -515,13 +540,17 @@ class PolylingualEmbeddingsClassifier:
         tinit = time.time()
         langs = list(lX.keys())
         WEtr, Ytr = [], []
-        self.fit_vectorizers(lX) # if already fit, does nothing
-        for lang in langs:
-            WEtr.append(self.embed(lX[lang], lang))
-            Ytr.append(ly[lang])
+        # self.fit_vectorizers(lX) # if already fit, does nothing
+        self.vectorize(lX)
+        # config = {'unsupervised' : False, 'supervised': True}
+        self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
+        WEtr = self.embedding_space.predict(self.config, lX)
+        # for lang in langs:
+        #     WEtr.append(self.embed(lX[lang], lang)) # todo embed with other matrices
+        #     Ytr.append(ly[lang])
 
-        WEtr = np.vstack(WEtr)
-        Ytr = np.vstack(Ytr)
+        WEtr = np.vstack([WEtr[lang] for lang in langs])
+        Ytr = np.vstack([ly[lang] for lang in langs])
         self.embed_time = time.time() - tinit
 
         print('fitting the WE-space of shape={}'.format(WEtr.shape))
|
||||||
:param lX: a dictionary {language_label: [list of preprocessed documents]}
|
:param lX: a dictionary {language_label: [list of preprocessed documents]}
|
||||||
"""
|
"""
|
||||||
assert self.model is not None, 'predict called before fit'
|
assert self.model is not None, 'predict called before fit'
|
||||||
|
self.vectorize(lX, prediction=True)
|
||||||
langs = list(lX.keys())
|
langs = list(lX.keys())
|
||||||
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
|
lWEte = self.embedding_space.predict(self.config, lX)
|
||||||
|
# lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
|
||||||
return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
|
return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
|
||||||
|
|
||||||
def predict_proba(self, lX):
|
def predict_proba(self, lX):
|
||||||
|
|
|
||||||