Set arguments to reproduce 'master' performance with the Neural setting

This commit is contained in:
andrea 2021-02-02 11:23:55 +01:00
parent bca0b9ab7c
commit 10bed81916
4 changed files with 44 additions and 11 deletions

22
main.py
View File

@ -15,9 +15,21 @@ def main(args):
print('Running generalized funnelling...') print('Running generalized funnelling...')
data = MultilingualDataset.load(args.dataset) data = MultilingualDataset.load(args.dataset)
# data.set_view(languages=['it', 'da']) data.set_view(languages=['it', 'da'])
data.show_dimensions() data.show_dimensions()
lX, ly = data.training() lX, ly = data.training()
# Testing zero shot experiments
# zero_shot_setting = True
# if zero_shot_setting:
# # _lX = {}
# _ly = {}
# train_langs = ['it']
# for train_lang in train_langs:
# # _lX[train_lang] = lX[train_lang]
# _ly[train_lang] = ly[train_lang]
# ly = _ly
lXte, lyte = data.test() lXte, lyte = data.test()
# Init multilingualIndex - mandatory when deploying Neural View Generators... # Init multilingualIndex - mandatory when deploying Neural View Generators...
@ -33,7 +45,7 @@ def main(args):
embedder_list.append(posteriorEmbedder) embedder_list.append(posteriorEmbedder)
if args.muse_embedder: if args.muse_embedder:
museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs) museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs, zero_shot=True)
embedder_list.append(museEmbedder) embedder_list.append(museEmbedder)
if args.wce_embedder: if args.wce_embedder:
@ -99,7 +111,7 @@ def main(args):
microf1=microf1, microf1=microf1,
macrok=macrok, macrok=macrok,
microk=microk, microk=microk,
notes='') notes=f'Train langs: {sorted(lX.keys())}')
print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))
overall_time = round(time.time() - time_init, 3) overall_time = round(time.time() - time_init, 3)
@ -112,8 +124,8 @@ if __name__ == '__main__':
parser.add_argument('dataset', help='Path to the dataset') parser.add_argument('dataset', help='Path to the dataset')
parser.add_argument('-o', '--output', dest='csv_dir', metavar='', parser.add_argument('-o', '--output', dest='csv_dir', metavar='',
help='Result file (default ../csv_logs/gfun/gfun_results.csv)', type=str, help='Result file (default csv_logs/gfun/gfun_results.csv)', type=str,
default='../csv_logs/gfun/gfun_results.csv') default='csv_logs/gfun/gfun_results.csv')
parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true', parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true',
help='deploy posterior probabilities embedder to compute document embeddings', help='deploy posterior probabilities embedder to compute document embeddings',

4
run.sh
View File

@ -1,6 +1,8 @@
#!/usr/bin/env bash #!/usr/bin/env bash
python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -g --gpus 0 echo Running Zero-shot experiments [output at csv_logs/gfun/zero_shot_gfun.csv]
python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -m -o csv_logs/gfun/zero_shot_gfun.csv --gpus 0
#for i in {0..10..1} #for i in {0..10..1}
#do #do

View File

@ -378,7 +378,7 @@ def get_method_name(args):
for i, conf in enumerate(_id_conf): for i, conf in enumerate(_id_conf):
if conf: if conf:
_id += _id_name[i] _id += _id_name[i]
_id = _id if not args.gru_wce else _id + '_wce' _id = _id if not args.rnn_wce else _id + '_wce'
_dataset_path = args.dataset.split('/')[-1].split('_') _dataset_path = args.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1] dataset_id = _dataset_path[0] + _dataset_path[-1]
return _id, dataset_id return _id, dataset_id

View File

@ -99,7 +99,7 @@ class MuseGen(ViewGen):
View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word
embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings.
""" """
def __init__(self, muse_dir='../embeddings', n_jobs=-1): def __init__(self, muse_dir='../embeddings', zero_shot=False, n_jobs=-1):
""" """
Init the MuseGen. Init the MuseGen.
:param muse_dir: string, path to folder containing muse embeddings :param muse_dir: string, path to folder containing muse embeddings
@ -111,6 +111,7 @@ class MuseGen(ViewGen):
self.langs = None self.langs = None
self.lMuse = None self.lMuse = None
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
self.zero_shot = zero_shot
def fit(self, lX, ly): def fit(self, lX, ly):
""" """
@ -135,16 +136,34 @@ class MuseGen(ViewGen):
:param lX: dict {lang: indexed documents} :param lX: dict {lang: indexed documents}
:return: document projection to the common latent space. :return: document projection to the common latent space.
""" """
lX = self.vectorizer.transform(lX) # Testing zero-shot experiments
if self.zero_shot:
lX = {l: self.vectorizer.vectorizer[l].transform(lX[l]) for l in self.langs if lX[l] is not None}
else:
lX = self.vectorizer.transform(lX)
XdotMUSE = Parallel(n_jobs=self.n_jobs)( XdotMUSE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs) delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in sorted(lX.keys()))
lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)} lZ = {lang: XdotMUSE[i] for i, lang in enumerate(sorted(lX.keys()))}
lZ = _normalize(lZ, l2=True) lZ = _normalize(lZ, l2=True)
return lZ return lZ
def fit_transform(self, lX, ly): def fit_transform(self, lX, ly):
print('## NB: Calling fit_transform!')
if self.zero_shot:
return self.fit(lX, ly).transform(self.zero_shot_experiments(lX))
return self.fit(lX, ly).transform(lX) return self.fit(lX, ly).transform(lX)
def zero_shot_experiments(self, lX, train_langs: list = None):
    """
    Build the zero-shot view of lX: only the training languages keep their documents.
    :param lX: dict {lang: documents}; must contain every training language listed in self.langs
    :param train_langs: list of language codes to keep; when None it defaults to ['it']
    :return: dict with one entry per language in self.langs, holding lX[lang] for training
    languages and None for every other language
    """
    # A None sentinel replaces the former mutable list default, which would be one
    # shared object across every call of this method.
    if train_langs is None:
        train_langs = ['it']
    print(f'# Zero-shot setting! Training langs will be set to: {sorted(train_langs)}')
    # Languages outside train_langs are kept as keys but masked with None.
    return {lang: (lX[lang] if lang in train_langs else None) for lang in self.langs}
class WordClassGen(ViewGen): class WordClassGen(ViewGen):
""" """