Set arguments to reproduce 'master' performance with the Neural setting
This commit is contained in:
parent
bca0b9ab7c
commit
10bed81916
22
main.py
22
main.py
|
|
@ -15,9 +15,21 @@ def main(args):
|
||||||
print('Running generalized funnelling...')
|
print('Running generalized funnelling...')
|
||||||
|
|
||||||
data = MultilingualDataset.load(args.dataset)
|
data = MultilingualDataset.load(args.dataset)
|
||||||
# data.set_view(languages=['it', 'da'])
|
data.set_view(languages=['it', 'da'])
|
||||||
data.show_dimensions()
|
data.show_dimensions()
|
||||||
lX, ly = data.training()
|
lX, ly = data.training()
|
||||||
|
|
||||||
|
# Testing zero shot experiments
|
||||||
|
# zero_shot_setting = True
|
||||||
|
# if zero_shot_setting:
|
||||||
|
# # _lX = {}
|
||||||
|
# _ly = {}
|
||||||
|
# train_langs = ['it']
|
||||||
|
# for train_lang in train_langs:
|
||||||
|
# # _lX[train_lang] = lX[train_lang]
|
||||||
|
# _ly[train_lang] = ly[train_lang]
|
||||||
|
# ly = _ly
|
||||||
|
|
||||||
lXte, lyte = data.test()
|
lXte, lyte = data.test()
|
||||||
|
|
||||||
# Init multilingualIndex - mandatory when deploying Neural View Generators...
|
# Init multilingualIndex - mandatory when deploying Neural View Generators...
|
||||||
|
|
@ -33,7 +45,7 @@ def main(args):
|
||||||
embedder_list.append(posteriorEmbedder)
|
embedder_list.append(posteriorEmbedder)
|
||||||
|
|
||||||
if args.muse_embedder:
|
if args.muse_embedder:
|
||||||
museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs)
|
museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs, zero_shot=True)
|
||||||
embedder_list.append(museEmbedder)
|
embedder_list.append(museEmbedder)
|
||||||
|
|
||||||
if args.wce_embedder:
|
if args.wce_embedder:
|
||||||
|
|
@ -99,7 +111,7 @@ def main(args):
|
||||||
microf1=microf1,
|
microf1=microf1,
|
||||||
macrok=macrok,
|
macrok=macrok,
|
||||||
microk=microk,
|
microk=microk,
|
||||||
notes='')
|
notes=f'Train langs: {sorted(lX.keys())}')
|
||||||
print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))
|
print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))
|
||||||
|
|
||||||
overall_time = round(time.time() - time_init, 3)
|
overall_time = round(time.time() - time_init, 3)
|
||||||
|
|
@ -112,8 +124,8 @@ if __name__ == '__main__':
|
||||||
parser.add_argument('dataset', help='Path to the dataset')
|
parser.add_argument('dataset', help='Path to the dataset')
|
||||||
|
|
||||||
parser.add_argument('-o', '--output', dest='csv_dir', metavar='',
|
parser.add_argument('-o', '--output', dest='csv_dir', metavar='',
|
||||||
help='Result file (default ../csv_logs/gfun/gfun_results.csv)', type=str,
|
help='Result file (default csv_logs/gfun/gfun_results.csv)', type=str,
|
||||||
default='../csv_logs/gfun/gfun_results.csv')
|
default='csv_logs/gfun/gfun_results.csv')
|
||||||
|
|
||||||
parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true',
|
parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true',
|
||||||
help='deploy posterior probabilities embedder to compute document embeddings',
|
help='deploy posterior probabilities embedder to compute document embeddings',
|
||||||
|
|
|
||||||
4
run.sh
4
run.sh
|
|
@ -1,6 +1,8 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -g --gpus 0
|
echo Running Zero-shot experiments [output at csv_logs/gfun/zero_shot_gfun.csv]
|
||||||
|
|
||||||
|
python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -m -o csv_logs/gfun/zero_shot_gfun.csv --gpus 0
|
||||||
|
|
||||||
#for i in {0..10..1}
|
#for i in {0..10..1}
|
||||||
#do
|
#do
|
||||||
|
|
|
||||||
|
|
@ -378,7 +378,7 @@ def get_method_name(args):
|
||||||
for i, conf in enumerate(_id_conf):
|
for i, conf in enumerate(_id_conf):
|
||||||
if conf:
|
if conf:
|
||||||
_id += _id_name[i]
|
_id += _id_name[i]
|
||||||
_id = _id if not args.gru_wce else _id + '_wce'
|
_id = _id if not args.rnn_wce else _id + '_wce'
|
||||||
_dataset_path = args.dataset.split('/')[-1].split('_')
|
_dataset_path = args.dataset.split('/')[-1].split('_')
|
||||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||||
return _id, dataset_id
|
return _id, dataset_id
|
||||||
|
|
|
||||||
|
|
@ -99,7 +99,7 @@ class MuseGen(ViewGen):
|
||||||
View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word
|
View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word
|
||||||
embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings.
|
embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings.
|
||||||
"""
|
"""
|
||||||
def __init__(self, muse_dir='../embeddings', n_jobs=-1):
|
def __init__(self, muse_dir='../embeddings', zero_shot=False, n_jobs=-1):
|
||||||
"""
|
"""
|
||||||
Init the MuseGen.
|
Init the MuseGen.
|
||||||
:param muse_dir: string, path to folder containing muse embeddings
|
:param muse_dir: string, path to folder containing muse embeddings
|
||||||
|
|
@ -111,6 +111,7 @@ class MuseGen(ViewGen):
|
||||||
self.langs = None
|
self.langs = None
|
||||||
self.lMuse = None
|
self.lMuse = None
|
||||||
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
|
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
|
||||||
|
self.zero_shot = zero_shot
|
||||||
|
|
||||||
def fit(self, lX, ly):
|
def fit(self, lX, ly):
|
||||||
"""
|
"""
|
||||||
|
|
@ -135,16 +136,34 @@ class MuseGen(ViewGen):
|
||||||
:param lX: dict {lang: indexed documents}
|
:param lX: dict {lang: indexed documents}
|
||||||
:return: document projection to the common latent space.
|
:return: document projection to the common latent space.
|
||||||
"""
|
"""
|
||||||
lX = self.vectorizer.transform(lX)
|
# Testing zero-shot experiments
|
||||||
|
if self.zero_shot:
|
||||||
|
lX = {l: self.vectorizer.vectorizer[l].transform(lX[l]) for l in self.langs if lX[l] is not None}
|
||||||
|
else:
|
||||||
|
lX = self.vectorizer.transform(lX)
|
||||||
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
|
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
|
||||||
delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs)
|
delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in sorted(lX.keys()))
|
||||||
lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)}
|
lZ = {lang: XdotMUSE[i] for i, lang in enumerate(sorted(lX.keys()))}
|
||||||
lZ = _normalize(lZ, l2=True)
|
lZ = _normalize(lZ, l2=True)
|
||||||
return lZ
|
return lZ
|
||||||
|
|
||||||
def fit_transform(self, lX, ly):
|
def fit_transform(self, lX, ly):
|
||||||
|
print('## NB: Calling fit_transform!')
|
||||||
|
if self.zero_shot:
|
||||||
|
return self.fit(lX, ly).transform(self.zero_shot_experiments(lX))
|
||||||
return self.fit(lX, ly).transform(lX)
|
return self.fit(lX, ly).transform(lX)
|
||||||
|
|
||||||
|
def zero_shot_experiments(self, lX, train_langs: list = ['it']):
|
||||||
|
print(f'# Zero-shot setting! Training langs will be set to: {sorted(train_langs)}')
|
||||||
|
_lX = {}
|
||||||
|
for lang in self.langs:
|
||||||
|
if lang in train_langs:
|
||||||
|
_lX[lang] = lX[lang]
|
||||||
|
else:
|
||||||
|
_lX[lang] = None
|
||||||
|
lX = _lX
|
||||||
|
return lX
|
||||||
|
|
||||||
|
|
||||||
class WordClassGen(ViewGen):
|
class WordClassGen(ViewGen):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue