sketched out documentation

This commit is contained in:
andrea 2021-01-26 14:02:51 +01:00
parent 2a8075bbc2
commit 30d2be245c
4 changed files with 160 additions and 35 deletions

View File

@ -88,14 +88,21 @@ class RecurrentDataset(Dataset):
class RecurrentDataModule(pl.LightningDataModule):
def __init__(self, multilingualIndex, batchsize=64):
"""
Pytorch Lightning Datamodule to be deployed with RecurrentGen.
https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
"""
def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1):
"""
Pytorch-lightning DataModule: https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
:param multilingualIndex:
:param batchsize:
Init RecurrentDataModule.
:param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
indexed by language code.
:param batchsize: int, number of samples per batch.
:param n_jobs: int, number of concurrent workers to deploy for data loading.
"""
self.multilingualIndex = multilingualIndex
self.batchsize = batchsize
self.n_jobs = n_jobs
super().__init__()
def prepare_data(self, *args, **kwargs):
@ -128,15 +135,15 @@ class RecurrentDataModule(pl.LightningDataModule):
lPad_index=self.multilingualIndex.l_pad())
def train_dataloader(self):
return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
collate_fn=self.training_dataset.collate_fn)
def val_dataloader(self):
return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
collate_fn=self.val_dataset.collate_fn)
def test_dataloader(self):
return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
collate_fn=self.test_dataset.collate_fn)
@ -156,7 +163,18 @@ def tokenize(l_raw, max_len):
class BertDataModule(RecurrentDataModule):
"""
Pytorch Lightning Datamodule to be deployed with BertGen.
https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
"""
def __init__(self, multilingualIndex, batchsize=64, max_len=512):
"""
Init BertDataModule.
:param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
indexed by language code.
:param batchsize: int, number of samples per batch.
:param max_len: int, maximum number of tokens per document. The absolute cap is 512.
"""
super().__init__(multilingualIndex, batchsize)
self.max_len = max_len
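For orientation, a minimal usage sketch (not part of this commit) of how one of these DataModules is typically wired into a pytorch-lightning Trainer; the model object and Trainer arguments below are illustrative placeholders:

import pytorch_lightning as pl

def train_with_datamodule(model, multilingualIndex):
    # batchsize and n_jobs mirror the constructor parameters documented above
    datamodule = RecurrentDataModule(multilingualIndex, batchsize=64, n_jobs=4)
    # illustrative Trainer settings; the repository configures its own Trainer elsewhere
    trainer = pl.Trainer(max_epochs=50, gpus=0)
    trainer.fit(model, datamodule=datamodule)
    trainer.test(model, datamodule=datamodule)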

View File

@ -4,9 +4,13 @@ from view_generators import VanillaFunGen
class DocEmbedderList:
"""
Class that takes care of calling the fit and transform functions of every initialized embedder. Every
ViewGenerator should be wrapped by this class so that the overall architecture can be trained seamlessly.
"""
def __init__(self, embedder_list, probabilistic=True):
"""
Class that takes care of calling fit and transform function for every init embedder.
Init the DocEmbedderList.
:param embedder_list: list of embedders to be deployed
:param probabilistic: whether to recast the view generators' outputs to vectors of posterior probabilities or not
"""
@ -23,11 +27,22 @@ class DocEmbedderList:
self.embedders = _tmp
def fit(self, lX, ly):
"""
Fit all the ViewGenerators contained by the DocEmbedderList.
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self
"""
for embedder in self.embedders:
embedder.fit(lX, ly)
return self
def transform(self, lX):
"""
Project documents through every ViewGenerator. The projections are then averaged together and returned.
:param lX: dict {lang: indexed documents}
:return: common latent space (projections averaged across the view generators).
"""
langs = sorted(lX.keys())
lZparts = {lang: None for lang in langs}
@ -40,14 +55,24 @@ class DocEmbedderList:
else:
lZparts[lang] += Z
n_embedders = len(self.embedders)
return {lang: lZparts[lang]/n_embedders for lang in langs}
return {lang: lZparts[lang]/n_embedders for lang in langs} # Averaging feature spaces
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
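A hedged sketch of how the list is meant to be used (the helper below is illustrative, not repository code): every view generator is fitted, the documents are projected by each of them, and the per-language projections are averaged as done in transform above.

def build_common_space(embedder_list, lX, ly):
    # lX: dict {lang: documents}, ly: dict {lang: target vectors}
    doc_embedders = DocEmbedderList(embedder_list, probabilistic=True)
    # returns dict {lang: projection matrix averaged over all view generators}
    return doc_embedders.fit_transform(lX, ly)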
class FeatureSet2Posteriors:
"""
Takes care of recasting the features output by the embedders into vectors of posterior probabilities by means of
a multiclass SVM.
"""
def __init__(self, embedder, l2=True, n_jobs=-1):
"""
Init the class.
:param embedder: ViewGen, a view generator that does not natively output posterior probabilities.
:param l2: bool, whether to apply L2 normalization to the projection
:param n_jobs: int, number of concurrent workers.
"""
self.embedder = embedder
self.l2 = l2
self.n_jobs = n_jobs
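The recasting step itself is not shown in this hunk; below is a hedged sketch of the general idea (a calibrated one-vs-rest SVM mapping an arbitrary feature space to per-class posteriors), written with plain scikit-learn rather than the repository's own helpers:

from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import normalize
from sklearn.svm import SVC

def features_to_posteriors(Z_train, y_train, Z, l2=True, n_jobs=-1):
    # probability=True enables Platt calibration, so predict_proba is available
    clf = OneVsRestClassifier(SVC(probability=True), n_jobs=n_jobs)
    clf.fit(Z_train, y_train)
    posteriors = clf.predict_proba(Z)          # vectors of posterior probabilities
    return normalize(posteriors, norm='l2') if l2 else posteriors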
@ -77,6 +102,11 @@ class FeatureSet2Posteriors:
class Funnelling:
"""
Funnelling Architecture. It is composed of two tiers. The first tier is a set of heterogeneous document embedders.
The second tier (i.e., the meta-classifier) performs the classification over the common latent space computed by
the first-tier learners.
"""
def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1):
self.first_tier = first_tier
self.meta = meta_classifier
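A hedged sketch of the two-tier flow the docstring describes; the meta-classifier's fit/predict interface is assumed here for illustration:

def funnelling_fit_predict(gfun: Funnelling, lX, ly, lX_test):
    # first tier: project the documents of every language into the common latent space
    lZ = gfun.first_tier.fit_transform(lX, ly)
    # second tier: the meta-classifier learns (and later predicts) on that space
    gfun.meta.fit(lZ, ly)
    return gfun.meta.predict(gfun.first_tier.transform(lX_test))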

View File

@ -26,6 +26,7 @@ def main(args):
lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir)
multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary())
# Init ViewGenerators and append them to embedder_list
embedder_list = []
if args.post_embedder:
posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs)
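An illustrative continuation of this snippet, rewritten as a hypothetical helper (every flag name other than post_embedder is assumed): each enabled view generator is appended to embedder_list, which is finally wrapped by DocEmbedderList.

def build_embedder_list(args):
    # hypothetical helper mirroring the snippet above: one view generator per CLI flag
    embedder_list = []
    if args.post_embedder:
        embedder_list.append(VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs))
    if args.muse_embedder:                                   # hypothetical flag name
        embedder_list.append(MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs))
    return DocEmbedderList(embedder_list, probabilistic=True)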

View File

@ -30,6 +30,10 @@ from util.embeddings_manager import MuseLoader, XdotM, wce_matrix
class ViewGen(ABC):
"""
Abstract class for ViewGenerator implementations. Every ViewGen should implement these three methods in order to
be seamlessly integrated into the overall architecture.
"""
@abstractmethod
def fit(self, lX, ly):
pass
@ -44,9 +48,13 @@ class ViewGen(ABC):
class VanillaFunGen(ViewGen):
"""
View Generator (x): original funnelling architecture proposed by Moreo, Esuli and
Sebastiani in DOI: https://doi.org/10.1145/3326065
"""
def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1):
"""
Original funnelling architecture proposed by Moreo, Esuli and Sebastiani in DOI: https://doi.org/10.1145/3326065
Init the Posterior Probabilities embedder (i.e., VanillaFunGen).
:param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to
return posterior probabilities.
:param base_learner:
@ -68,11 +76,10 @@ class VanillaFunGen(ViewGen):
def transform(self, lX):
"""
(1) Vectorize documents
(2) Project them according to the learners SVMs
(3) Apply L2 normalization to the projection
:param lX:
:return:
(1) Vectorize documents; (2) project them according to the learners' SVMs; (3) apply L2 normalization
to the projection and return it.
:param lX: dict {lang: indexed documents}
:return: document projection to the common latent space.
"""
lX = self.vectorizer.transform(lX)
lZ = self.doc_projector.predict_proba(lX)
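Condensed, the three steps of the docstring look roughly like the following (illustrative helper; lZ is assumed to be a {lang: matrix} dictionary as elsewhere in the file):

from sklearn.preprocessing import normalize

def posterior_projection(vectorizer, doc_projector, lX):
    lX = vectorizer.transform(lX)                 # (1) vectorize the raw documents
    lZ = doc_projector.predict_proba(lX)          # (2) project them via the first-tier SVMs
    return {lang: normalize(Z, norm='l2') for lang, Z in lZ.items()}   # (3) L2-normalize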
@ -84,10 +91,13 @@ class VanillaFunGen(ViewGen):
class MuseGen(ViewGen):
"""
View Generator (m): generates document representations via MUSE embeddings (fastText multilingual word
embeddings). Document embeddings are obtained as a weighted sum of each document's constituent word embeddings.
"""
def __init__(self, muse_dir='../embeddings', n_jobs=-1):
"""
generates document representation via MUSE embeddings (Fasttext multilingual word
embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings.
Init the MuseGen.
:param muse_dir: string, path to the folder containing the MUSE embeddings
:param n_jobs: int, number of concurrent workers
"""
@ -99,6 +109,12 @@ class MuseGen(ViewGen):
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
def fit(self, lX, ly):
"""
(1) Vectorize documents; (2) load the MUSE embeddings for the words encountered while vectorizing.
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting MuseGen (M)...')
self.vectorizer.fit(lX)
self.langs = sorted(lX.keys())
@ -109,6 +125,12 @@ class MuseGen(ViewGen):
return self
def transform(self, lX):
"""
(1) Vectorize documents; (2) compute the weighted sum of the MUSE embeddings of each document's words;
(3) apply L2 normalization to the resulting embedding and return it.
:param lX: dict {lang: indexed documents}
:return: document projection to the common latent space.
"""
lX = self.vectorizer.transform(lX)
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs)
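The per-language helper XdotM is not shown in this diff; as a rough sketch of the weighted-sum idea behind it (the real helper also applies SIF weighting, which this plain average omits):

import numpy as np

def doc_embeddings_by_weighted_sum(X, E):
    # X: (n_docs, |V|) weighted term matrix (e.g. tf-idf); E: (|V|, dim) word embeddings
    doc_vectors = np.asarray(X @ E)                     # weighted sum of word vectors
    mass = np.asarray(X.sum(axis=1)).reshape(-1, 1)     # total term weight per document
    mass[mass == 0] = 1.0                               # guard against empty documents
    return doc_vectors / mass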
@ -121,10 +143,13 @@ class MuseGen(ViewGen):
class WordClassGen(ViewGen):
"""
View Generator (w): generates document representations via Word-Class Embeddings.
Document embeddings are obtained as a weighted sum of each document's constituent word embeddings.
"""
def __init__(self, n_jobs=-1):
"""
generates document representation via Word-Class-Embeddings.
Document embeddings are obtained via weighted sum of document's constituent embeddings.
Init WordClassGen.
:param n_jobs: int, number of concurrent workers
"""
super().__init__()
@ -134,6 +159,12 @@ class WordClassGen(ViewGen):
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
def fit(self, lX, ly):
"""
(1) Vectorize documents; (2) compute the Word-Class Embeddings for the words encountered while vectorizing.
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting WordClassGen (W)...')
lX = self.vectorizer.fit_transform(lX)
self.langs = sorted(lX.keys())
@ -144,6 +175,12 @@ class WordClassGen(ViewGen):
return self
def transform(self, lX):
"""
(1) Vectorize documents; (2) compute the weighted sum of the Word-Class Embeddings of each document's words;
(3) apply L2 normalization to the resulting embedding and return it.
:param lX: dict {lang: indexed documents}
:return: document projection to the common latent space.
"""
lX = self.vectorizer.transform(lX)
XdotWce = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs)
@ -156,17 +193,28 @@ class WordClassGen(ViewGen):
class RecurrentGen(ViewGen):
"""
View Generator (G): generates document embeddings by means of a Gated Recurrent Unit (GRU). The model can be
initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.).
Output dimension is (n_docs, 512). Training happens end-to-end. At inference time, the model returns
its internal state at the second feed-forward layer. Training metrics are logged via TensorBoard.
"""
def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
gpus=0, n_jobs=-1, stored_path=None):
"""
generates document embedding by means of a Gated Recurrent Units. The model can be
initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,).
Output dimension is (n_docs, 512).
:param multilingualIndex:
:param pretrained_embeddings:
:param wce:
:param gpus:
:param n_jobs:
Init RecurrentGen.
:param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
indexed by language code.
:param pretrained_embeddings: dict {lang: tensor of embeddings}, the pretrained embeddings to use
in the embedding layer.
:param wce: bool, whether to deploy Word-Class Embeddings (as proposed by A. Moreo). If True, the supervised
embeddings are concatenated to the deployed pretrained embeddings. WCE dimensionality is equal to
the number of target classes.
:param batch_size: int, number of samples in a batch.
:param nepochs: int, maximum number of epochs to train the model for.
:param gpus: int, how many GPUs to use per node. If 0, computation takes place on the CPU.
:param n_jobs: int, number of concurrent workers deployed for data loading.
:param stored_path: str, path to a pretrained model. If None the model will be trained from scratch.
"""
super().__init__()
self.multilingualIndex = multilingualIndex
@ -212,14 +260,15 @@ class RecurrentGen(ViewGen):
def fit(self, lX, ly):
"""
Train the Neural Network end-to-end.
lX and ly are not used directly. They are instead retrieved from the multilingual index used when the
Dataset object (RecurrentDataset) is instantiated by the RecurrentDataModule class.
:param lX:
:param ly:
:return:
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting RecurrentGen (G)...')
recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size)
recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs)
trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs,
checkpoint_callback=False)
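A hedged end-to-end usage sketch of the generator (argument values are illustrative; fit returns self and transform returns the 512-dimensional projections, as documented above):

def build_recurrent_view(multilingualIndex, pretrained_embeddings, lX, ly):
    gen = RecurrentGen(multilingualIndex, pretrained_embeddings, wce=True,
                       batch_size=512, nepochs=50, gpus=1, n_jobs=4)
    return gen.fit(lX, ly).transform(lX)    # dict {lang: (n_docs, 512) projections}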
@ -236,9 +285,9 @@ class RecurrentGen(ViewGen):
def transform(self, lX):
"""
Project documents to the common latent space
:param lX:
:return:
Project documents to the common latent space. Output dimensionality is 512.
:param lX: dict {lang: indexed documents}
:return: documents projected to the common latent space.
"""
l_pad = self.multilingualIndex.l_pad()
data = self.multilingualIndex.l_devel_index()
@ -255,7 +304,22 @@ class RecurrentGen(ViewGen):
class BertGen(ViewGen):
"""
View Generator (b): generates document embedding via Bert model. The training happens end-to-end.
At inference time, the model returns the network internal state at the last original layer (i.e. 12th). Document
embeddings are the state associated with the "start" token. Training metrics are logged via TensorBoard.
"""
def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, stored_path=None):
"""
Init the BertGen view generator (BERT model).
:param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
indexed by language code.
:param batch_size: int, number of samples per batch.
:param nepochs: int, maximum number of epochs to train the model for.
:param gpus: int, how many GPUs to use per node. If 0, computation takes place on the CPU.
:param n_jobs: int, number of concurrent workers.
:param stored_path: str, path to a pretrained model. If None the model will be trained from scratch.
"""
super().__init__()
self.multilingualIndex = multilingualIndex
self.nepochs = nepochs
@ -271,6 +335,14 @@ class BertGen(ViewGen):
return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus)
def fit(self, lX, ly):
"""
Train the Neural Network end-to-end.
lX and ly are not used directly. They are instead retrieved from the multilingual index used when the
Dataset object (RecurrentDataset) is instantiated by the BertDataModule class.
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting BertGen (M)...')
self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512)
@ -281,7 +353,11 @@ class BertGen(ViewGen):
return self
def transform(self, lX):
# lX is raw text data. It has to be first indexed via Bert Tokenizer.
"""
Project documents to the common latent space. Output dimensionality is 768.
:param lX: dict {lang: indexed documents}
:return: documents projected to the common latent space.
"""
data = self.multilingualIndex.l_devel_raw_index()
data = tokenize(data, max_len=512)
self.model.to('cuda' if self.gpus else 'cpu')
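For reference, a hedged sketch of the "start"-token (i.e. [CLS]) extraction described in the class docstring, written against the Hugging Face transformers API directly rather than the repository's BertModel wrapper (model name and helper are illustrative):

import torch
from transformers import AutoModel, AutoTokenizer

def cls_embeddings(texts, device='cpu', max_len=512):
    tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
    bert = AutoModel.from_pretrained('bert-base-multilingual-cased').to(device).eval()
    batch = tokenizer(texts, truncation=True, max_length=max_len,
                      padding=True, return_tensors='pt').to(device)
    with torch.no_grad():
        out = bert(**batch)
    # hidden state of the first ([CLS]) token for every document: shape (n_docs, 768)
    return out.last_hidden_state[:, 0, :]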