refactor: added MUSE to learning/transformers.py
parent 9bf1986402, commit c14e8226b1
@@ -148,7 +148,7 @@ class FastTextWikiNews(Vectors):
     url_base = 'Cant auto-download MUSE embeddings'
     path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
-    _name = '/embeddings/wiki.multi.{}.vec'
+    _name = '/wiki.multi.{}.vec'
 
     def __init__(self, cache, language="en", **kwargs):
         url = self.url_base.format(language)
@@ -156,6 +156,7 @@ class FastTextWikiNews(Vectors):
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
 
+
 class EmbeddingsAligned(Vectors):
 
     def __init__(self, type, path, lang, voc):
@@ -186,10 +187,11 @@ class FastTextMUSE(PretrainedEmbeddings):
 
     def __init__(self, path, lang, limit=None):
         super().__init__()
-        print(f'Loading fastText pretrained vectors from {path}')
+        print(f'Loading fastText pretrained vectors for language {lang} from {path}')
         assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
         self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
 
+
     def vocabulary(self):
         return set(self.embed.stoi.keys())
@@ -0,0 +1,103 @@
+from abc import ABC, abstractmethod
+import torch, torchtext
+import gensim
+import os
+import numpy as np
+
+
+class KeyedVectors:
+
+    def __init__(self, word2index, weights):
+        assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
+        index2word = {i:w for w,i in word2index.items()}
+        assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
+        self.word2index = word2index
+        self.index2word = index2word
+        self.weights = weights
+
+    def extract(self, words):
+        dim = self.weights.shape[1]
+        v_size = len(words)
+
+        source_idx, target_idx = [], []
+        for i,word in enumerate(words):
+            if word not in self.word2index: continue
+            j = self.word2index[word]
+            source_idx.append(i)
+            target_idx.append(j)
+
+        extraction = np.zeros((v_size, dim))
+        extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
+
+        return extraction
+
+
+class PretrainedEmbeddings(ABC):
+
+    def __init__(self):
+        super().__init__()
+
+    @abstractmethod
+    def vocabulary(self): pass
+
+    @abstractmethod
+    def dim(self): pass
+
+    @classmethod
+    def reindex(cls, words, word2index):
+        source_idx, target_idx = [], []
+        for i, word in enumerate(words):
+            if word not in word2index: continue
+            j = word2index[word]
+            source_idx.append(i)
+            target_idx.append(j)
+        source_idx = np.asarray(source_idx)
+        target_idx = np.asarray(target_idx)
+        return source_idx, target_idx
+
+
+class GloVe(PretrainedEmbeddings):
+
+    def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
+        super().__init__()
+        print(f'Loading GloVe pretrained vectors from torchtext')
+        self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
+        print('Done')
+
+    def vocabulary(self):
+        return set(self.embed.stoi.keys())
+
+    def dim(self):
+        return self.embed.dim
+
+    def extract(self, words):
+        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
+        extraction = torch.zeros((len(words), self.dim()))
+        extraction[source_idx] = self.embed.vectors[target_idx]
+        return extraction
+
+
+class Word2Vec(PretrainedEmbeddings):
+
+    def __init__(self, path, limit=None):
+        super().__init__()
+        print(f'Loading word2vec pretrained vectors from {path}')
+        assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}')
+        self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
+        self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
+        print('Done')
+
+    def vocabulary(self):
+        return set(self.word2index.keys())
+
+    def dim(self):
+        return self.embed.vector_size
+
+    def extract(self, words):
+        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
+        extraction = np.zeros((len(words), self.dim()))
+        extraction[source_idx] = self.embed.vectors[target_idx]
+        extraction = torch.from_numpy(extraction).float()
+        return extraction
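Aside from the diff itself, here is a minimal usage sketch of the extraction helpers added above (not part of the commit; the toy dictionary and vectors are invented): KeyedVectors.extract builds a matrix aligned to a target vocabulary, leaving zero rows for out-of-vocabulary words.

import numpy as np

word2index = {'the': 0, 'cat': 1, 'sat': 2}   # toy index (hypothetical)
weights = np.random.rand(3, 300)              # toy 300-d vectors (hypothetical)
kv = KeyedVectors(word2index, weights)        # the wrapper class added in this new file

vocab = ['cat', 'dog', 'the']                 # 'dog' is out-of-vocabulary
E = kv.extract(vocab)                         # shape (3, 300); the 'dog' row stays all zeros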
@@ -1,6 +1,7 @@
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 #from data.text_preprocessor import NLTKStemTokenizer
+from embeddings.embeddings import FastTextMUSE
 from embeddings.supervised import supervised_embeddings_tfidf, zscores
 from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling
 import time
@@ -53,7 +54,7 @@ class PosteriorProbabilitiesEmbedder:
             self.fist_tier_parameters,
             n_jobs=n_jobs)
 
-    def fit(self, lX, lY):
+    def fit(self, lX, lY, lV=None):
         print('fitting the projectors... {}'.format(lX.keys()))
         self.doc_projector.fit(lX, lY)
         return self
@@ -63,20 +64,45 @@ class PosteriorProbabilitiesEmbedder:
         lZ = self.doc_projector.predict_proba(lX)
         return lZ
 
-    def fit_transform(self, lX, ly=None):
+    def fit_transform(self, lX, ly=None, lV=None):
         return self.fit(lX, ly).transform(lX)
 
     def best_params(self):
         return self.doc_projector.best_params()
 
 
+class MuseEmbedder:
+
+    def __init__(self, path, n_jobs=-1):
+        self.path=path
+        self.n_jobs = n_jobs
+
+    def fit(self, lX, ly, lV):
+        self.langs = sorted(lX.keys())
+        MUSE = Parallel(n_jobs=self.n_jobs)(
+            delayed(FastTextMUSE)(self.path, lang) for lang in self.langs
+        )
+        self.MUSE = {l:MUSE[i].extract(lV[l]).numpy() for i,l in enumerate(self.langs)}
+        return self
+
+    def transform(self, lX):
+        MUSE = self.MUSE
+        XdotMUSE = Parallel(n_jobs=self.n_jobs)(
+            delayed(XdotM)(lX[lang], MUSE[lang]) for lang in self.langs
+        )
+        return {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
+
+    def fit_transform(self, lX, ly, lV):
+        return self.fit(lX, ly, lV).transform(lX)
+
+
 class WordClassEmbedder:
 
     def __init__(self, n_jobs=-1, max_label_space=300):
         self.n_jobs = n_jobs
         self.max_label_space=max_label_space
 
-    def fit(self, lX, ly):
+    def fit(self, lX, ly, lV=None):
         self.langs = sorted(lX.keys())
         WCE = Parallel(n_jobs=self.n_jobs)(
             delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs
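As a hedged usage sketch (not in the commit; lXtr, lytr, and lV below are placeholders for the per-language tf-idf matrices, labels, and vocabularies produced elsewhere in the pipeline), the new MuseEmbedder follows the same fit/transform protocol as the other embedders: fit loads one aligned MUSE space per language and extracts the rows for that language's vocabulary, transform projects each documents-by-terms matrix onto it.

muse = MuseEmbedder(path='../embeddings', n_jobs=2)  # expects wiki.multi.{lang}.vec files under this path
muse.fit(lXtr, lytr, lV)     # lV: {lang: list of vocabulary terms}; lytr is unused by this embedder
lZ = muse.transform(lXtr)    # {lang: document matrix projected into the shared MUSE space}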
@@ -91,7 +117,7 @@ class WordClassEmbedder:
         )
         return {l: XdotWCE[i] for i, l in enumerate(self.langs)}
 
-    def fit_transform(self, lX, ly):
+    def fit_transform(self, lX, ly, lV=None):
         return self.fit(lX, ly).transform(lX)
 
 
@@ -119,11 +145,13 @@ def XdotM(X,M):
 
 
 class DocEmbedderList:
     def __init__(self, *embedder_list):
+        if len(embedder_list)==0: embedder_list=[]
         self.embedders = embedder_list
 
-    def fit(self, lX, ly):
+    def fit(self, lX, ly, lV):
         for transformer in self.embedders:
-            transformer.fit(lX,ly)
+            transformer.fit(lX,ly,lV)
         return self
 
     def transform(self, lX):
@@ -145,12 +173,15 @@ class DocEmbedderList:
         return {l:hstacker(lZparts[l]) for l in langs}
 
-    def fit_transform(self, lX, ly):
-        return self.fit(lX, ly).transform(lX)
+    def fit_transform(self, lX, ly, lV):
+        return self.fit(lX, ly, lV).transform(lX)
 
     def best_params(self):
         return {'todo'}
 
+    def append(self, embedder):
+        self.embedders.append(embedder)
+
 
 # ------------------------------------------------------------------
 # Meta-Classifier
 # ------------------------------------------------------------------
@@ -200,7 +231,8 @@ class Funnelling:
 
     def fit(self, lX, ly):
         lX = self.vectorizer.fit_transform(lX, ly)
-        lZ = self.first_tier.fit_transform(lX, ly)
+        lV = self.vectorizer.vocabulary()
+        lZ = self.first_tier.fit_transform(lX, ly, lV)
         self.meta.fit(lZ, ly)
 
     def predict(self, lX, ly=None):
@@ -3,7 +3,7 @@ from dataset_builder import MultilingualDataset
 # from learning.learners import *
 from learning.learners import FunnellingMultimodal
 from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
-    TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder
+    TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder
 from util.evaluation import *
 from optparse import OptionParser
 from util.file import exists
@@ -21,14 +21,17 @@ parser.add_option("-d", "--dataset", dest="dataset",
 parser.add_option("-o", "--output", dest="output",
                   help="Result file", type=str, default='./results/results.csv')
 
-parser.add_option("-e", "--mode-embed", dest="mode_embed",
-                  help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')
+parser.add_option("-P", "--probs", dest="probs", action='store_true',
+                  help="Add posterior probabilities to the document embedding representation", default=False)
+
+parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
+                  help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False)
+
+parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
+                  help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
 
 parser.add_option("-w", "--we-path", dest="we_path",
-                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/')
-
-parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
-                  default='MUSE')
+                  help="Path to the MUSE polylingual word embeddings", default='../embeddings')
 
 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)
@@ -40,16 +43,12 @@ parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
                   help="Number of parallel jobs (default is -1, all)", default=-1)
 
 parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
-                  help="If smaller than number of target classes, PCA will be applied to supervised matrix. "
-                       "If set to 0 it will automatically search for the best number of components. "
-                       "If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)",
+                  help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
                   default=300)
 
-parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
-                  help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
-                       " If set to 0 it will automatically search for the best number of components", default=300)
-
-parser.add_option("-l", dest="lang", type=str)
+# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
+#                   help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
+#                   " If set to 0 it will automatically search for the best number of components", default=300)
 
 # parser.add_option("-a", dest="post_pca",
 #                   help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
@@ -57,13 +56,7 @@ parser.add_option("-l", dest="lang", type=str)
 
 
 def get_learner(calibrate=False, kernel='linear'):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1,
-               # class_weight='balanced',
-               gamma='auto')
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
 
 
 def get_params(dense=False):
@@ -89,69 +82,32 @@ if __name__ == '__main__':
     data = MultilingualDataset.load(op.dataset)
     data.show_dimensions()
 
-    # data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
-    # data.set_view(languages=[op.lang])
-    # data.set_view(categories=list(range(10)))
     lXtr, lytr = data.training()
     lXte, lyte = data.test()
 
-    if op.set_c != -1:
-        meta_parameters = None
-    else:
-        meta_parameters = [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
+    meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
 
-    # Embeddings and WCE config
-    _available_mode = ['none', 'unsupervised', 'supervised', 'both']
-    _available_type = ['MUSE', 'FastText']
-    assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
-    assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'
+    result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
 
-    if op.mode_embed == 'none':
-        config = {'unsupervised': False,
-                  'supervised': False,
-                  'we_type': None}
-        _config_id = 'None'
-    elif op.mode_embed == 'unsupervised':
-        config = {'unsupervised': True,
-                  'supervised': False,
-                  'we_type': op.we_type}
-        _config_id = 'M'
-    elif op.mode_embed == 'supervised':
-        config = {'unsupervised': False,
-                  'supervised': True,
-                  'we_type': None}
-        _config_id = 'F'
-    elif op.mode_embed == 'both':
-        config = {'unsupervised': True,
-                  'supervised': True,
-                  'we_type': op.we_type}
-        _config_id = 'M+F'
+    print(f'{result_id}')
 
-    config['reduction'] = 'PCA'
-    config['max_label_space'] = op.max_labels_S
-    config['dim_reduction_unsupervised'] = op.max_labels_U
-    # config['post_pca'] = op.post_pca
-    # config['plot_covariance_matrices'] = True
-
-    result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
-
-    print(f'### PolyEmbedd_andrea_{_config_id}\n')
-    # classifier = FunnellingMultimodal(we_path=op.we_path,
-    #                                   config=config,
-    #                                   first_tier_learner=get_learner(calibrate=True),
-    #                                   meta_learner=get_learner(calibrate=False, kernel='rbf'),
-    #                                   first_tier_parameters=None, # TODO get_params(dense=False),--> first_tier should not be optimized - or not?
-    #                                   meta_parameters=get_params(dense=True),
-    #                                   n_jobs=op.n_jobs)
-
+    # text preprocessing
     tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
-    post_prob = PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)
-    wce_proj = WordClassEmbedder()
-    doc_embedder = DocEmbedderList(post_prob, wce_proj)
-    # doc_embedder = DocEmbedderList(post_prob)
-    meta = MetaClassifier(meta_learner=SVC(), meta_parameters=get_params(dense=True))
-    classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
+
+    # document embedding modules
+    doc_embedder = DocEmbedderList()
+    if op.probs:
+        doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
+    if op.supervised:
+        doc_embedder.append(WordClassEmbedder(max_label_space=op.max_labels_S))
+    if op.pretrained:
+        doc_embedder.append(MuseEmbedder(op.we_path))
+
+    # metaclassifier
+    meta = MetaClassifier(meta_learner=SVC(), meta_parameters=get_params(dense=True))
+
+    # ensembling the modules
+    classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
 
     print('# Fitting ...')
     classifier.fit(lXtr, lytr)
@@ -163,7 +119,7 @@ if __name__ == '__main__':
     for lang in lXte.keys():
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
-        print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
+        print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
     # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
     #                 (config['max_label_space'], classifier.best_components),
    #                  config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
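For context, a condensed sketch of how the revised script wires the modules together when all three flags (-P, -S, -U) are enabled; lXtr, lytr, and lXte are placeholders for the per-language training and test sets loaded above, and nothing here goes beyond what the diff itself introduces.

tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)

doc_embedder = DocEmbedderList()
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True),
                                                   first_tier_parameters=None))  # -P / --probs
doc_embedder.append(WordClassEmbedder(max_label_space=300))                      # -S / --supervised
doc_embedder.append(MuseEmbedder('../embeddings'))                               # -U / --pretrained

meta = MetaClassifier(meta_learner=SVC(), meta_parameters=get_params(dense=True))
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)

classifier.fit(lXtr, lytr)          # vectorizes, passes the shared vocabulary lV to every embedder, then fits the meta-classifier
ly_pred = classifier.predict(lXte)  # per-language predictions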