diff --git a/main.py b/main.py index cff4887..4e4fbc5 100644 --- a/main.py +++ b/main.py @@ -15,9 +15,21 @@ def main(args): print('Running generalized funnelling...') data = MultilingualDataset.load(args.dataset) - # data.set_view(languages=['it', 'da']) + data.set_view(languages=['it', 'da']) data.show_dimensions() lX, ly = data.training() + + # Testing zero shot experiments + # zero_shot_setting = True + # if zero_shot_setting: + # # _lX = {} + # _ly = {} + # train_langs = ['it'] + # for train_lang in train_langs: + # # _lX[train_lang] = lX[train_lang] + # _ly[train_lang] = ly[train_lang] + # ly = _ly + lXte, lyte = data.test() # Init multilingualIndex - mandatory when deploying Neural View Generators... @@ -33,7 +45,7 @@ def main(args): embedder_list.append(posteriorEmbedder) if args.muse_embedder: - museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs) + museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs, zero_shot=True) embedder_list.append(museEmbedder) if args.wce_embedder: @@ -99,7 +111,7 @@ def main(args): microf1=microf1, macrok=macrok, microk=microk, - notes='') + notes=f'Train langs: {sorted(lX.keys())}') print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) overall_time = round(time.time() - time_init, 3) @@ -112,8 +124,8 @@ if __name__ == '__main__': parser.add_argument('dataset', help='Path to the dataset') parser.add_argument('-o', '--output', dest='csv_dir', metavar='', - help='Result file (default ../csv_logs/gfun/gfun_results.csv)', type=str, - default='../csv_logs/gfun/gfun_results.csv') + help='Result file (default csv_logs/gfun/gfun_results.csv)', type=str, + default='csv_logs/gfun/gfun_results.csv') parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true', help='deploy posterior probabilities embedder to compute document embeddings', diff --git a/run.sh b/run.sh index fd7f4f0..09ce599 100644 --- a/run.sh +++ b/run.sh @@ -1,6 +1,8 @@ #!/usr/bin/env bash -python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -g --gpus 0 +echo Running Zero-shot experiments [output at csv_logs/gfun/zero_shot_gfun.csv] + +python main.py /home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -m -o csv_logs/gfun/zero_shot_gfun.csv --gpus 0 #for i in {0..10..1} #do diff --git a/src/util/common.py b/src/util/common.py index 913014c..9f44273 100644 --- a/src/util/common.py +++ b/src/util/common.py @@ -378,7 +378,7 @@ def get_method_name(args): for i, conf in enumerate(_id_conf): if conf: _id += _id_name[i] - _id = _id if not args.gru_wce else _id + '_wce' + _id = _id if not args.rnn_wce else _id + '_wce' _dataset_path = args.dataset.split('/')[-1].split('_') dataset_id = _dataset_path[0] + _dataset_path[-1] return _id, dataset_id diff --git a/src/view_generators.py b/src/view_generators.py index af4ee8e..f8bf289 100644 --- a/src/view_generators.py +++ b/src/view_generators.py @@ -99,7 +99,7 @@ class MuseGen(ViewGen): View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. """ - def __init__(self, muse_dir='../embeddings', n_jobs=-1): + def __init__(self, muse_dir='../embeddings', zero_shot=False, n_jobs=-1): """ Init the MuseGen. :param muse_dir: string, path to folder containing muse embeddings @@ -111,6 +111,7 @@ class MuseGen(ViewGen): self.langs = None self.lMuse = None self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + self.zero_shot = zero_shot def fit(self, lX, ly): """ @@ -135,16 +136,34 @@ class MuseGen(ViewGen): :param lX: dict {lang: indexed documents} :return: document projection to the common latent space. """ - lX = self.vectorizer.transform(lX) + # Testing zero-shot experiments + if self.zero_shot: + lX = {l: self.vectorizer.vectorizer[l].transform(lX[l]) for l in self.langs if lX[l] is not None} + else: + lX = self.vectorizer.transform(lX) XdotMUSE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs) - lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)} + delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in sorted(lX.keys())) + lZ = {lang: XdotMUSE[i] for i, lang in enumerate(sorted(lX.keys()))} lZ = _normalize(lZ, l2=True) return lZ def fit_transform(self, lX, ly): + print('## NB: Calling fit_transform!') + if self.zero_shot: + return self.fit(lX, ly).transform(self.zero_shot_experiments(lX)) return self.fit(lX, ly).transform(lX) + def zero_shot_experiments(self, lX, train_langs: list = ['it']): + print(f'# Zero-shot setting! Training langs will be set to: {sorted(train_langs)}') + _lX = {} + for lang in self.langs: + if lang in train_langs: + _lX[lang] = lX[lang] + else: + _lX[lang] = None + lX = _lX + return lX + class WordClassGen(ViewGen): """