reworked unsupervised (aligned) embeddings loader method and class (fastText and MUSE).
New option -t / --we-type ['MUSE', 'FastText'] selects which aligned embeddings to use. Uploaded /results/results.csv with the results (on rcv1 ... run0.pickle) obtained on all available setups. TODO: also refactor it as a standalone class with its own load/weighted sum/extract/reduce methods.
This commit is contained in:
parent
499c6018c0
commit
f2083bf22a
|
|
@ -17,11 +17,14 @@ parser.add_option("-o", "--output", dest="output",
|
||||||
help="Result file", type=str, default='./results/results.csv')
|
help="Result file", type=str, default='./results/results.csv')
|
||||||
|
|
||||||
parser.add_option("-e", "--mode-embed", dest="mode_embed",
|
parser.add_option("-e", "--mode-embed", dest="mode_embed",
|
||||||
help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
|
help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')
|
||||||
|
|
||||||
parser.add_option("-w", "--we-path", dest="we_path",
|
parser.add_option("-w", "--we-path", dest="we_path",
|
||||||
help="Path to the polylingual word embeddings", default='../embeddings/')
|
help="Path to the polylingual word embeddings", default='../embeddings/')
|
||||||
|
|
||||||
|
parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
|
||||||
|
default='FastText')
|
||||||
|
|
||||||
parser.add_option("-s", "--set_c", dest="set_c",type=float,
|
parser.add_option("-s", "--set_c", dest="set_c",type=float,
|
||||||
help="Set the C parameter", default=1)
|
help="Set the C parameter", default=1)
|
||||||
|
|
||||||
|
|
@ -36,7 +39,7 @@ def get_learner(calibrate=False, kernel='linear'):
|
||||||
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')
|
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')
|
||||||
|
|
||||||
|
|
||||||
def get_params(dense=False): # TODO kernel function could be useful for meta-classifier
|
def get_params(dense=False):
|
||||||
if not op.optimc:
|
if not op.optimc:
|
||||||
return None
|
return None
|
||||||
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
|
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
|
||||||
|
|
@ -72,30 +75,36 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
# Embeddings and WCE config
|
# Embeddings and WCE config
|
||||||
_available_mode = ['none', 'unsupervised', 'supervised', 'both']
|
_available_mode = ['none', 'unsupervised', 'supervised', 'both']
|
||||||
assert op.mode_embed in _available_mode , f'{op.mode_embed} not in {_available_mode}'
|
_available_type = ['MUSE', 'FastText']
|
||||||
|
assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
|
||||||
|
assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'
|
||||||
|
|
||||||
if op.mode_embed == 'none':
|
if op.mode_embed == 'none':
|
||||||
config = {'unsupervised': False,
|
config = {'unsupervised': False,
|
||||||
'supervised': False}
|
'supervised': False,
|
||||||
|
'we_type': None}
|
||||||
_config_id = 'None'
|
_config_id = 'None'
|
||||||
elif op.mode_embed == 'unsupervised':
|
elif op.mode_embed == 'unsupervised':
|
||||||
config = {'unsupervised': True,
|
config = {'unsupervised': True,
|
||||||
'supervised': False}
|
'supervised': False,
|
||||||
|
'we_type': op.we_type}
|
||||||
_config_id = 'M'
|
_config_id = 'M'
|
||||||
elif op.mode_embed == 'supervised':
|
elif op.mode_embed == 'supervised':
|
||||||
config = {'unsupervised': False,
|
config = {'unsupervised': False,
|
||||||
'supervised': True}
|
'supervised': True,
|
||||||
|
'we_type': None}
|
||||||
_config_id = 'F'
|
_config_id = 'F'
|
||||||
elif op.mode_embed == 'both':
|
elif op.mode_embed == 'both':
|
||||||
config = {'unsupervised': True,
|
config = {'unsupervised': True,
|
||||||
'supervised': True}
|
'supervised': True,
|
||||||
|
'we_type': op.we_type}
|
||||||
_config_id = 'M_and_F'
|
_config_id = 'M_and_F'
|
||||||
|
|
||||||
result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
|
result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
|
||||||
|
|
||||||
print(f'### PolyEmbedd_andrea_{_config_id}\n')
|
print(f'### PolyEmbedd_andrea_{_config_id}\n')
|
||||||
classifier = AndreaCLF(op.we_path,
|
classifier = AndreaCLF(we_path=op.we_path,
|
||||||
config,
|
config=config,
|
||||||
first_tier_learner=get_learner(calibrate=True),
|
first_tier_learner=get_learner(calibrate=True),
|
||||||
meta_learner=get_learner(calibrate=False, kernel='rbf'),
|
meta_learner=get_learner(calibrate=False, kernel='rbf'),
|
||||||
first_tier_parameters=get_params(dense=False),
|
first_tier_parameters=get_params(dense=False),
|
||||||
|
|
@ -114,5 +123,5 @@ if __name__ == '__main__':
|
||||||
metrics.append([macrof1, microf1, macrok, microk])
|
metrics.append([macrof1, microf1, macrok, microk])
|
||||||
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
|
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
|
||||||
results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1],
|
results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1],
|
||||||
'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
|
'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, '')
|
||||||
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
|
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
|
||||||
|
|
|
||||||
|
|
@ -147,7 +147,7 @@ class FastTextWikiNews(Vectors):
|
||||||
|
|
||||||
url_base = 'Cant auto-download MUSE embeddings'
|
url_base = 'Cant auto-download MUSE embeddings'
|
||||||
path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
|
path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
|
||||||
_name = 'wiki.multi.{}.vec'
|
_name = '/embeddings/wiki.multi.{}.vec'
|
||||||
|
|
||||||
def __init__(self, cache, language="en", **kwargs):
|
def __init__(self, cache, language="en", **kwargs):
|
||||||
url = self.url_base.format(language)
|
url = self.url_base.format(language)
|
||||||
|
|
@ -157,6 +157,30 @@ class FastTextWikiNews(Vectors):
|
||||||
super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
|
super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class EmbeddingsAligned(Vectors):
    """Aligned pre-trained word vectors (MUSE or FastText) for one language.

    The vector-file location is assembled from ``path`` and ``lang`` and then
    handed to the ``Vectors`` base class together with a fixed cache directory.
    """

    def __init__(self, type, path, lang):
        """Assemble the vector-file location and delegate loading to ``Vectors``.

        :param type: ``'MUSE'`` selects the MUSE file template and cache
            directory; any other value selects the FastText ones.
        :param path: prefix to which the per-language file name is appended
            (plain string concatenation, no separator is inserted).
        :param lang: language code substituted into the file-name template.
        :raises AssertionError: if ``path`` does not exist.
        """
        # NOTE: the parameter name `type` shadows the builtin; it is kept
        # because callers pass it under this name/position.
        if type == 'MUSE':
            self.name = '/embeddings/wiki.multi.{}.vec'
            # todo - rewrite as relative path
            self.cache_path = '/home/andreapdr/CLESA/embeddings'
        else:
            self.name = '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
            # todo - rewrite as relative path
            self.cache_path = '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'

        vector_file = self.name.format(lang)
        self.path = path + vector_file

        # Only the prefix `path` is checked here, not the full `self.path`.
        assert os.path.exists(path), f'pre-trained vectors not found in {path}'
        super().__init__(self.path, cache=self.cache_path)

    def vocabulary(self):
        """Return the set of keys of ``self.stoi``."""
        known_words = self.stoi.keys()
        return set(known_words)

    def dim(self):
        """Return ``self.dim``.

        NOTE(review): ``extract`` uses ``self.dim`` as an integer size, so an
        instance attribute of the same name presumably shadows this method
        once the vectors are loaded - confirm against the ``Vectors`` base
        class before relying on ``instance.dim()``.
        """
        return self.dim

    def extract(self, words):
        """Build a ``(len(words), self.dim)`` tensor of vectors for ``words``.

        Rows not listed in the first index array returned by
        ``PretrainedEmbeddings.reindex`` are left as zeros.
        """
        # Presumably: rows of `words` that have a vector, and the matching
        # rows of `self.vectors` - verify against PretrainedEmbeddings.reindex.
        rows_in_words, rows_in_vectors = PretrainedEmbeddings.reindex(words, self.stoi)
        out_shape = (len(words), self.dim)
        matrix = torch.zeros(out_shape)
        matrix[rows_in_words] = self.vectors[rows_in_vectors]
        return matrix
|
||||||
|
|
||||||
|
|
||||||
class FastTextMUSE(PretrainedEmbeddings):
|
class FastTextMUSE(PretrainedEmbeddings):
|
||||||
|
|
||||||
def __init__(self, path, lang, limit=None):
|
def __init__(self, path, lang, limit=None):
|
||||||
|
|
@ -179,12 +203,12 @@ class FastTextMUSE(PretrainedEmbeddings):
|
||||||
return extraction
|
return extraction
|
||||||
|
|
||||||
|
|
||||||
def embedding_matrix(path, voc, lang):
|
def embedding_matrix(type, path, voc, lang):
|
||||||
vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0])
|
vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0])
|
||||||
|
|
||||||
print('[embedding matrix]')
|
print('[embedding matrix]')
|
||||||
print(f'# [pretrained-matrix: FastTextMUSE {lang}]')
|
print(f'# [pretrained-matrix: {type} {lang}]')
|
||||||
pretrained = FastTextMUSE(path, lang)
|
pretrained = EmbeddingsAligned(type, path, lang)
|
||||||
P = pretrained.extract(vocabulary).numpy()
|
P = pretrained.extract(vocabulary).numpy()
|
||||||
del pretrained
|
del pretrained
|
||||||
print(f'[embedding matrix done] of shape={P.shape}\n')
|
print(f'[embedding matrix done] of shape={P.shape}\n')
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,6 @@ from sklearn.model_selection import GridSearchCV
|
||||||
from sklearn.model_selection import KFold
|
from sklearn.model_selection import KFold
|
||||||
from joblib import Parallel, delayed
|
from joblib import Parallel, delayed
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
|
||||||
from data.supervised import zscores
|
|
||||||
from transformers.StandardizeTransformer import StandardizeTransformer
|
from transformers.StandardizeTransformer import StandardizeTransformer
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -444,7 +442,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
|
||||||
first_tier_parameters=None,
|
first_tier_parameters=None,
|
||||||
meta_parameters=None,
|
meta_parameters=None,
|
||||||
folded_projections=1,
|
folded_projections=1,
|
||||||
calmode='cal', n_jobs=-1):
|
calmode='cal',
|
||||||
|
n_jobs=-1):
|
||||||
|
|
||||||
super().__init__(first_tier_learner,
|
super().__init__(first_tier_learner,
|
||||||
meta_learner,
|
meta_learner,
|
||||||
|
|
@ -479,9 +478,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
|
||||||
self.languages.append(lang)
|
self.languages.append(lang)
|
||||||
tfidf_vectorizer.fit(lX[lang])
|
tfidf_vectorizer.fit(lX[lang])
|
||||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||||
_sort_if_sparse(lX[lang])
|
|
||||||
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
|
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
|
||||||
self.lang_tfidf[lang] = tfidf_vectorizer # utile in fase di testing
|
self.lang_tfidf[lang] = tfidf_vectorizer
|
||||||
return self
|
return self
|
||||||
|
|
||||||
# @override std class method
|
# @override std class method
|
||||||
|
|
@ -517,15 +515,13 @@ class AndreaCLF(FunnellingPolylingualClassifier):
|
||||||
|
|
||||||
if unsupervised:
|
if unsupervised:
|
||||||
for lang in languages:
|
for lang in languages:
|
||||||
# print('Test building embedding matrix FastTextMuse ...')
|
_, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang)
|
||||||
_, M = embedding_matrix(self.we_path, self.lang_word2idx[lang], lang)
|
|
||||||
self.word_embeddings[lang] = M
|
self.word_embeddings[lang] = M
|
||||||
_r[lang] = lX[lang].dot(M)
|
_r[lang] = lX[lang].dot(M)
|
||||||
|
|
||||||
if supervised:
|
if supervised:
|
||||||
for lang in languages:
|
for lang in languages:
|
||||||
S = WCE_matrix(lX, ly, lang)
|
S = WCE_matrix(lX, ly, lang)
|
||||||
# S = np.squeeze(np.asarray(S)) # casting to ndarray to better visualize S while debugging
|
|
||||||
self.supervised_embeddings[lang] = S
|
self.supervised_embeddings[lang] = S
|
||||||
if unsupervised:
|
if unsupervised:
|
||||||
_r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
|
_r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
|
||||||
|
|
@ -562,7 +558,7 @@ class AndreaCLF(FunnellingPolylingualClassifier):
|
||||||
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
|
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
|
||||||
|
|
||||||
self.standardizer = StandardizeTransformer()
|
self.standardizer = StandardizeTransformer()
|
||||||
_vertical_Z = self.standardizer.fit_predict(_vertical_Z)
|
_vertical_Z = self.standardizer.fit_predict(_vertical_Z)
|
||||||
|
|
||||||
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
|
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
|
||||||
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
|
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,55 @@
|
||||||
id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes
|
id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes
|
||||||
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 it 0.5367684112761455 0.7945344129554656 0.5179685773363333 0.7651326488894972 nope
|
||||||
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 en 0.7866666666666666 0.0 0.7927111111111111 -0.0003376325207643527 nope
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 pt 0.6969974938193201 0.878625134264232 0.6967392557377021 0.8466030321042095 nope
|
||||||
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 fr 0.7866666666666666 0.0 0.7930666666666667 -0.0001350530083057411 nope
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 sv 0.502213941379271 0.7700107543401444 0.4991078326315248 0.7207899075774371 nope
|
||||||
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 es 0.5817849682843411 0.8448214916931778 0.5849433134898768 0.8202407220651875 nope
|
||||||
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 en 0.7933333333333333 0.0 0.7931111111111111 -0.00013505300830574107 nope
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 en 0.5284100314545743 0.7625649913344887 0.4968119038332687 0.7152142337789349 nope
|
||||||
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 fr 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 da 0.4868904596668941 0.7971705872676427 0.4554442856126113 0.741227149968307 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 nl 0.5470546398570723 0.8276762402088773 0.5177281560038681 0.7850292121533595 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 fr 0.4997574965766772 0.7678434382194935 0.4836027981945328 0.7099957841328215 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 161.99278807640076 de 0.4220457399934653 0.7444316119452236 0.4256936056238835 0.7167749374918141 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 it 0.5398437760931379 0.8008933172994331 0.5146465197929204 0.7584451610463148 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 pt 0.6975279233747671 0.8779959377115775 0.6911573032014029 0.8392738059784555 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 sv 0.5179339368901748 0.7752035065748278 0.4962165022301373 0.7133720895906155 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 es 0.5745246656272296 0.8476464247215235 0.5736797442258523 0.8104027280076678 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 en 0.5265892627601801 0.761854398025736 0.4868823643967914 0.7032312369952987 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 da 0.4857267508065667 0.7955911823647295 0.449682467737542 0.7293013090493592 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 nl 0.5461000743929812 0.8304711580801409 0.5139887576564601 0.7790659402231745 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 fr 0.5015991524998897 0.7699748500677114 0.4811739320459739 0.7065159928392686 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 503.81587314605713 de 0.4141396160516795 0.743810005053057 0.4126132681585116 0.7023983497130937 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 it 0.4810224709403544 0.7617194410047762 0.453310215598049 0.6999032557458222 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 pt 0.6693663195289151 0.8619702956806105 0.6657298472047529 0.8182397742327547 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 sv 0.43107388787211537 0.7126933954416902 0.4180735239763325 0.6168407376537499 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 es 0.5087201120140917 0.8249322493224932 0.5032299168859704 0.7835086748116167 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 en 0.3822498549987095 0.6877811094452774 0.3309945723997902 0.5962925522774631 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 da 0.4517051377915163 0.7658914728682171 0.4030339299921389 0.6806166833916132 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 nl 0.4875303727964308 0.7853962600178095 0.4534046979963794 0.7270844266398626 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 fr 0.3750315407356979 0.6999393816932714 0.3628389019101708 0.6136670285424017 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_F_optimC PolyEmbed_andrea svm F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1095.333437204361 de 0.355059356514748 0.7046466085098807 0.33834564366266284 0.6299245108196094 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 it 0.4755443069888554 0.7675079985780305 0.4501140447119437 0.7023435117413848 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 pt 0.673303227450142 0.8655002733734279 0.6702445967772233 0.8193963705153853 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 sv 0.4189470089118392 0.7236711786068009 0.4198491651634073 0.6314272037990425 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 es 0.5178080058189616 0.8268359020852222 0.5104336022388637 0.782714898784318 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 en 0.4115752894185112 0.7001869158878504 0.35164720517285003 0.6091191993104883 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 da 0.4437869429842064 0.7626499739175796 0.39704879178312197 0.6717100410826179 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 nl 0.47635948919429705 0.7874471399955486 0.4589309165206792 0.7292337019755739 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 fr 0.39374621795002507 0.7063947733122155 0.3850407928528449 0.6315594797194366 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.picklePolyEmbedd_andrea_M_and_F_optimC PolyEmbed_andrea svm M_and_F True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle not_binary not_ablation 1251.0414910316467 de 0.3539890425069821 0.7095981751184418 0.3512802070446796 0.6432196317592322 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 it 0.5791455159341481 0.8060849214309596 0.6034752340075125 0.7869853576681214 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 pt 0.6403974389994276 0.8803876562101505 0.6565213830246649 0.8497743924811387 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 sv 0.5032337014290953 0.7768595041322314 0.4719549200388494 0.7364733997369779 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 es 0.5200567247634353 0.8529964145466963 0.4908726477090496 0.8285929531854332 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 en 0.512424485488998 0.7533647963642719 0.4719843960571978 0.7044441169169227 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 da 0.5861231569852233 0.8040595842200032 0.5393761149602847 0.7381233055764151 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 nl 0.6072184716496147 0.8335123523093448 0.5845309357041368 0.8020267337813639 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 fr 0.4923294612439038 0.7854697603651578 0.4713782273939219 0.7329001302478475 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_None_optimC PolyEmbed_andrea svm None True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 161.2168996334076 de 0.4709904181031267 0.7457793804294378 0.4465581491449931 0.7046844416244138 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 it 0.575387626645539 0.8064243448858833 0.5958411838194531 0.7790018114269683 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 pt 0.653004040098633 0.8791937747161628 0.6559210761775208 0.8482450061614855 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 sv 0.49944915222086167 0.7789179104477612 0.4604673876743342 0.727778938054739 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 es 0.5144474487169811 0.8559087767795439 0.48397711649967695 0.8222692824953204 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 en 0.5160737755179508 0.755674709562109 0.45961112517260677 0.6921096138985132 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 da 0.5875776383868945 0.8015873015873016 0.5367286265015276 0.7288571047461061 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 nl 0.6079883230969934 0.8363004776378636 0.5828217771858487 0.7968282071156207 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 fr 0.4966338770370634 0.7860696517412935 0.46250527724325174 0.7292650668002159 nope
|
||||||
|
rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.picklePolyEmbedd_andrea_M_optimC PolyEmbed_andrea svm M True rcv1-2_doclist_trByLang1000_teByLang1000_processed_run1.pickle not_binary not_ablation 497.6823613643646 de 0.4675732669000923 0.7479187479187479 0.43767984457683634 0.69653035770654 nope
|
||||||
|
|
|
||||||
|
|
|
@ -20,4 +20,4 @@ class StandardizeTransformer:
|
||||||
return (X - self.mean) / self.std
|
return (X - self.mean) / self.std
|
||||||
|
|
||||||
def fit_predict(self, X):
|
def fit_predict(self, X):
|
||||||
return self.fit(X).predict(X)
|
return self.fit(X).predict(X)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue