diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py
index 1121a58..da6ec92 100644
--- a/refactor/data/datamodule.py
+++ b/refactor/data/datamodule.py
@@ -88,14 +88,21 @@ class RecurrentDataset(Dataset):
 
 
 class RecurrentDataModule(pl.LightningDataModule):
-    def __init__(self, multilingualIndex, batchsize=64):
+    """
+    PyTorch Lightning DataModule to be deployed with RecurrentGen.
+    https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
+    """
+    def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1):
         """
-        Pytorch-lightning DataModule: https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
-        :param multilingualIndex:
-        :param batchsize:
+        Init RecurrentDataModule.
+        :param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
+            indexed by language code.
+        :param batchsize: int, number of samples per batch.
+        :param n_jobs: int, number of concurrent workers to be deployed (i.e., parallelizing data loading).
         """
         self.multilingualIndex = multilingualIndex
         self.batchsize = batchsize
+        self.n_jobs = n_jobs
         super().__init__()
 
     def prepare_data(self, *args, **kwargs):
@@ -128,15 +135,15 @@ class RecurrentDataModule(pl.LightningDataModule):
                                               lPad_index=self.multilingualIndex.l_pad())
 
     def train_dataloader(self):
-        return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
+        return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
                           collate_fn=self.training_dataset.collate_fn)
 
     def val_dataloader(self):
-        return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
+        return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
                           collate_fn=self.val_dataset.collate_fn)
 
     def test_dataloader(self):
-        return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
+        return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
                           collate_fn=self.test_dataset.collate_fn)
 
 
@@ -156,7 +163,18 @@ def tokenize(l_raw, max_len):
 
 
 class BertDataModule(RecurrentDataModule):
+    """
+    PyTorch Lightning DataModule to be deployed with BertGen.
+    https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
+    """
     def __init__(self, multilingualIndex, batchsize=64, max_len=512):
+        """
+        Init BertDataModule.
+        :param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
+            indexed by language code.
+        :param batchsize: int, number of samples per batch.
+        :param max_len: int, maximum number of tokens per document. Absolute cap is 512.
+        """
         super().__init__(multilingualIndex, batchsize)
         self.max_len = max_len
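For context, a minimal usage sketch of the new n_jobs plumbing; it is not part of the patch. Here `multilingualIndex` stands in for an already-built MultilingualIndex instance, and the hook calls assume the standard LightningDataModule API. Note also that torch's DataLoader requires num_workers >= 0, so the joblib-style -1 default would need to be overridden (or clamped) before the loaders are built.

    # Hedged usage sketch: the n_jobs argument now reaches the DataLoaders.
    datamodule = RecurrentDataModule(multilingualIndex, batchsize=64, n_jobs=4)
    datamodule.prepare_data()                      # standard LightningDataModule hooks
    datamodule.setup()
    train_loader = datamodule.train_dataloader()   # DataLoader built with num_workers=4

diff --git a/refactor/funnelling.py b/refactor/funnelling.py
index 4d19e1a..812a937 100644
--- a/refactor/funnelling.py
+++ b/refactor/funnelling.py
@@ -4,9 +4,13 @@ from view_generators import VanillaFunGen
 
 
 class DocEmbedderList:
+    """
+    Class that takes care of calling the fit and transform methods of every initialized embedder. Every ViewGenerator
+    should be wrapped by this class in order to seamlessly train the overall architecture.
+    """
     def __init__(self, embedder_list, probabilistic=True):
         """
-        Class that takes care of calling fit and transform function for every init embedder.
+        Init the DocEmbedderList.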
         :param embedder_list: list of embedders to be deployed
         :param probabilistic: whether to recast view generators output to vectors of posterior probabilities or not
         """
@@ -23,11 +27,22 @@ class DocEmbedderList:
         self.embedders = _tmp
 
     def fit(self, lX, ly):
+        """
+        Fit all the ViewGenerators contained by the DocEmbedderList.
+        :param lX: dict {lang: documents}
+        :param ly: dict {lang: target vectors}
+        :return: self
+        """
         for embedder in self.embedders:
             embedder.fit(lX, ly)
         return self
 
     def transform(self, lX):
+        """
+        Project documents by means of every ViewGenerator. Projections are then averaged together and returned.
+        :param lX: dict {lang: documents}
+        :return: common latent space (averaged).
+        """
         langs = sorted(lX.keys())
         lZparts = {lang: None for lang in langs}
 
@@ -40,14 +55,24 @@ class DocEmbedderList:
             else:
                 lZparts[lang] += Z
         n_embedders = len(self.embedders)
-        return {lang: lZparts[lang]/n_embedders for lang in langs}
+        return {lang: lZparts[lang]/n_embedders for lang in langs}  # Averaging feature spaces
 
     def fit_transform(self, lX, ly):
         return self.fit(lX, ly).transform(lX)
 
 
 class FeatureSet2Posteriors:
+    """
+    Takes care of recasting features output by the embedders into vectors of posterior probabilities by means of
+    a multiclass SVM.
+    """
     def __init__(self, embedder, l2=True, n_jobs=-1):
+        """
+        Init the FeatureSet2Posteriors.
+        :param embedder: ViewGen, a view generator that does not natively output posterior probabilities.
+        :param l2: bool, whether to apply L2 normalization to the projection.
+        :param n_jobs: int, number of concurrent workers.
+        """
         self.embedder = embedder
         self.l2 = l2
         self.n_jobs = n_jobs
@@ -77,6 +102,11 @@ class FeatureSet2Posteriors:
 
 
 class Funnelling:
+    """
+    Funnelling Architecture. It is composed of two tiers. The first tier is a set of heterogeneous document embedders.
+    The second tier (i.e., the meta-classifier) performs the classification of the common latent space computed by
+    the first-tier learners.
+    """
     def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1):
         self.first_tier = first_tier
         self.meta = meta_classifier
diff --git a/refactor/main.py b/refactor/main.py
index 48936d0..ebc43a3 100644
--- a/refactor/main.py
+++ b/refactor/main.py
@@ -26,6 +26,7 @@ def main(args):
     lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir)
     multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary())
 
+    # Init ViewGenerators and append them to embedder_list
     embedder_list = []
     if args.post_embedder:
         posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs)
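To make the two-tier flow concrete, an illustrative composition sketch follows; it is not part of the patch. The `meta_classifier` object, the `gfun.fit`/`gfun.predict` entry points, and the data dicts `lX`, `ly`, `lXte` are assumptions based on the surrounding code, not confirmed API; `get_learner` is the helper used in main.py.

    # Hedged sketch: composing first-tier view generators with the meta-classifier.
    embedders = DocEmbedderList([
        VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=-1),
        MuseGen(muse_dir='../embeddings', n_jobs=-1),
    ], probabilistic=True)
    gfun = Funnelling(first_tier=embedders,
                      meta_classifier=meta_classifier,  # assumed: an already-built MetaClassifier
                      n_jobs=-1)
    gfun.fit(lX, ly)              # lX: {lang: documents}, ly: {lang: target vectors}
    ly_pred = gfun.predict(lXte)  # assumed prediction entry point

diff --git a/refactor/view_generators.py b/refactor/view_generators.py
index 6cdd4a9..384ec76 100644
--- a/refactor/view_generators.py
+++ b/refactor/view_generators.py
@@ -30,6 +30,10 @@ from util.embeddings_manager import MuseLoader, XdotM, wce_matrix
 
 
 class ViewGen(ABC):
+    """
+    Abstract class for ViewGenerator implementations. Every ViewGen should implement these three methods in order to
+    be seamlessly integrated into the overall architecture.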
+ """ @abstractmethod def fit(self, lX, ly): pass @@ -44,9 +48,13 @@ class ViewGen(ABC): class VanillaFunGen(ViewGen): + """ + View Generator (x): original funnelling architecture proposed by Moreo, Esuli and + Sebastiani in DOI: https://doi.org/10.1145/3326065 + """ def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1): """ - Original funnelling architecture proposed by Moreo, Esuli and Sebastiani in DOI: https://doi.org/10.1145/3326065 + Init Posterior Probabilities embedder (i.e., VanillaFunGen) :param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to return posterior probabilities. :param base_learner: @@ -68,11 +76,10 @@ class VanillaFunGen(ViewGen): def transform(self, lX): """ - (1) Vectorize documents - (2) Project them according to the learners SVMs - (3) Apply L2 normalization to the projection - :param lX: - :return: + (1) Vectorize documents; (2) Project them according to the learners SVMs, finally (3) Apply L2 normalization + to the projection and returns it. + :param lX: dict {lang: indexed documents} + :return: document projection to the common latent space. """ lX = self.vectorizer.transform(lX) lZ = self.doc_projector.predict_proba(lX) @@ -84,10 +91,13 @@ class VanillaFunGen(ViewGen): class MuseGen(ViewGen): + """ + View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word + embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. + """ def __init__(self, muse_dir='../embeddings', n_jobs=-1): """ - generates document representation via MUSE embeddings (Fasttext multilingual word - embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. + Init the MuseGen. :param muse_dir: string, path to folder containing muse embeddings :param n_jobs: int, number of concurrent workers """ @@ -99,6 +109,12 @@ class MuseGen(ViewGen): self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) def fit(self, lX, ly): + """ + (1) Vectorize documents; (2) Load muse embeddings for words encountered while vectorizing. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. + """ print('# Fitting MuseGen (M)...') self.vectorizer.fit(lX) self.langs = sorted(lX.keys()) @@ -109,6 +125,12 @@ class MuseGen(ViewGen): return self def transform(self, lX): + """ + (1) Vectorize documents; (2) computes the weighted sum of MUSE embeddings found at document level, + finally (3) Apply L2 normalization embedding and returns it. + :param lX: dict {lang: indexed documents} + :return: document projection to the common latent space. + """ lX = self.vectorizer.transform(lX) XdotMUSE = Parallel(n_jobs=self.n_jobs)( delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs) @@ -121,10 +143,13 @@ class MuseGen(ViewGen): class WordClassGen(ViewGen): + """ + View Generator (w): generates document representation via Word-Class-Embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. + """ def __init__(self, n_jobs=-1): """ - generates document representation via Word-Class-Embeddings. - Document embeddings are obtained via weighted sum of document's constituent embeddings. + Init WordClassGen. 
@@ -121,10 +143,13 @@
 
 
 class WordClassGen(ViewGen):
+    """
+    View Generator (w): generates document representations via Word-Class Embeddings.
+    Document embeddings are obtained via a weighted sum of the documents' constituent embeddings.
+    """
     def __init__(self, n_jobs=-1):
         """
-        generates document representation via Word-Class-Embeddings.
-        Document embeddings are obtained via weighted sum of document's constituent embeddings.
+        Init WordClassGen.
         :param n_jobs: int, number of concurrent workers
         """
         super().__init__()
@@ -134,6 +159,12 @@
         self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
 
     def fit(self, lX, ly):
+        """
+        (1) Vectorize documents; (2) compute Word-Class Embeddings for the words encountered while vectorizing.
+        :param lX: dict {lang: indexed documents}
+        :param ly: dict {lang: target vectors}
+        :return: self.
+        """
         print('# Fitting WordClassGen (W)...')
         lX = self.vectorizer.fit_transform(lX)
         self.langs = sorted(lX.keys())
@@ -144,6 +175,12 @@
         return self
 
     def transform(self, lX):
+        """
+        (1) Vectorize documents; (2) compute the weighted sum of Word-Class Embeddings at document level; finally,
+        (3) apply L2 normalization to the embedding and return it.
+        :param lX: dict {lang: indexed documents}
+        :return: document projection to the common latent space.
+        """
         lX = self.vectorizer.transform(lX)
         XdotWce = Parallel(n_jobs=self.n_jobs)(
             delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs)
@@ -156,17 +193,28 @@
 
 
 class RecurrentGen(ViewGen):
+    """
+    View Generator (G): generates document embeddings by means of a Gated Recurrent Unit (GRU) network. The model can
+    be initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.).
+    Output dimension is (n_docs, 512). Training happens end-to-end. At inference time, the model returns
+    the network's internal state at the second feed-forward layer. Training metrics are logged via TensorBoard.
+    """
     def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50, gpus=0, n_jobs=-1,
                  stored_path=None):
         """
-        generates document embedding by means of a Gated Recurrent Units. The model can be
-        initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,).
-        Output dimension is (n_docs, 512).
-        :param multilingualIndex:
-        :param pretrained_embeddings:
-        :param wce:
-        :param gpus:
-        :param n_jobs:
+        Init RecurrentGen.
+        :param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
+            indexed by language code.
+        :param pretrained_embeddings: dict {lang: tensor of embeddings}, the pretrained embeddings to use
+            as the embedding layer.
+        :param wce: bool, whether to deploy Word-Class Embeddings (as proposed by A. Moreo). If True, the supervised
+            embeddings are concatenated to the deployed pretrained embeddings. WCE dimensionality is equal to
+            the number of target classes.
+        :param batch_size: int, number of samples in a batch.
+        :param nepochs: int, maximum number of epochs to train the model.
+        :param gpus: int, specifies how many GPUs to use per node. If 0, computation will take place on the CPU.
+        :param n_jobs: int, number of concurrent workers (i.e., parallelizing data loading).
+        :param stored_path: str, path to a pretrained model. If None, the model will be trained from scratch.
         """
         super().__init__()
         self.multilingualIndex = multilingualIndex
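A hedged instantiation sketch for the recurrent view generator, not part of the patch; `l_pretrained` is a placeholder for a dict {lang: tensor of embeddings} (e.g., loaded MUSE vectors) and is not a variable defined in the repo. Every argument mirrors the documented __init__ parameters.

    # Hypothetical construction of the GRU-based view generator.
    recurrent_gen = RecurrentGen(multilingualIndex,
                                 pretrained_embeddings=l_pretrained,  # {lang: tensor of embeddings}
                                 wce=True,       # concatenate Word-Class Embeddings
                                 batch_size=512,
                                 nepochs=50,
                                 gpus=1,         # 0 -> CPU
                                 n_jobs=4,
                                 stored_path=None)
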
""" print('# Fitting RecurrentGen (G)...') - recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size) + recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs) trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, checkpoint_callback=False) @@ -236,9 +285,9 @@ class RecurrentGen(ViewGen): def transform(self, lX): """ - Project documents to the common latent space - :param lX: - :return: + Project documents to the common latent space. Output dimensionality is 512. + :param lX: dict {lang: indexed documents} + :return: documents projected to the common latent space. """ l_pad = self.multilingualIndex.l_pad() data = self.multilingualIndex.l_devel_index() @@ -255,7 +304,22 @@ class RecurrentGen(ViewGen): class BertGen(ViewGen): + """ + View Generator (b): generates document embedding via Bert model. The training happens end-to-end. + At inference time, the model returns the network internal state at the last original layer (i.e. 12th). Document + embeddings are the state associated with the "start" token. Training metrics are logged via TensorBoard. + """ def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, stored_path=None): + """ + Init Bert model + :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents + indexed by language code. + :param batch_size: int, number of samples per batch. + :param nepochs: int, number of max epochs to train the model. + :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. + :param n_jobs: int, number of concurrent workers. + :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. + """ super().__init__() self.multilingualIndex = multilingualIndex self.nepochs = nepochs @@ -271,6 +335,14 @@ class BertGen(ViewGen): return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus) def fit(self, lX, ly): + """ + Train the Neural Network end-to-end. + lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation + of the Dataset object (RecurrentDataset) in the GfunDataModule class. + :param lX: dict {lang: indexed documents} + :param ly: dict {lang: target vectors} + :return: self. + """ print('# Fitting BertGen (M)...') self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) @@ -281,7 +353,11 @@ class BertGen(ViewGen): return self def transform(self, lX): - # lX is raw text data. It has to be first indexed via Bert Tokenizer. + """ + Project documents to the common latent space. Output dimensionality is 768. + :param lX: dict {lang: indexed documents} + :return: documents projected to the common latent space. + """ data = self.multilingualIndex.l_devel_raw_index() data = tokenize(data, max_len=512) self.model.to('cuda' if self.gpus else 'cpu')