refactoring

2021-02-04 10:26:05 +01:00 · 2021-02-04 10:26:05 +01:00 · 1cd9ec251a
parent c0c116fd66
commit 1cd9ec251a
9 changed files with 417 additions and 330 deletions
--- a/Notes.txt
+++ b/Notes.txt
@ -4,3 +4,12 @@ b) unos "simplified" que son peores que los de Ruder porque he quitado ese layer
 También vi que se mejoraba con l2(phi(x)) así que lo he dejado así
 Ahora voy a probar a añadir ese layer adicional como último step in phi(x) <-- ejecutando
 Luego quiero probar a imponer la regularización en todos los layers antes de la clasificación...
+
+Lo de la l2 es un requisito de supervised contrastive learning (SCL)
+El problema para aplicar SCL es entender qué quiere decir el "crop" en texto, y en particular en AA. Podría simplemente
+    ser equivalente a "fragmento", es decir, que un tipo de inductive bias es que un fragmento de un texto de un autor
+    debe tener una representación similar a otro fragmento del mismo texto. Hay que entender bien cómo generarlos,
+    de forma que los fragmentos sean caracterizantes (esto quiere decir probablemente imponer una cierta extensión).
+    También hay que entender cómo tratar los solapamientos entre fragmentos.
+
+Una idea de título sería: "AA is to Classification as SCL is to SAV", or AA = Classif - SCL + SAV
--- a/TODO.txt
+++ b/TODO.txt
@ -1,3 +1,31 @@
+Recap Feb. 2021:
+- Adapt everything to testing a classic neural training for AA (i.e., projector+classifier training) vs. applying Supervised
+    Contrastive Learning (SCL) as a pretraining step for solving SAV, and then training a linear classifier with
+    the projector network frozen. Reassess the work in terms of SAV and make connections with KTA and SVM. Maybe claim
+    that SCL+SVM is the way to go.
+- Compare (Attribution):
+    - S.Ruder systems
+    - My system (projector+classifier layer) as a reimplementation of S.Ruder's systems
+    - Projector trained via SCL + Classifier layer trained alone.
+    - Projector trained via SCL + SVM Classifier.
+    - Projector trained via KTA + SVM Classifier.
+- Compare (SAV):
+    - My system (projector+binary-classifier layer)
+    - Projector trained via SCL + Binary Classifier layer trained alone.
+    - Projector trained via SCL + SVM Classifier.
+    - Projector trained via KTA + SVM Classifier.
+    - Other systems (maybe Diff-Vectors, maybe Impostors, maybe distance-based)
+- Additional experiments:
+    - show the kernel matrix
+
+Future:
+- Also test on general TC? There are some torch datasets in torchtext that could simplify things... but that would
+    blur the idea of SCL-SAV
+
+Code:
+- redo dataset in terms of PyTorch's DataLoader
+
+---------------------
 Things to clarify:

 about the network:
@ -23,4 +51,6 @@ maybe I have to review the validation of the sav-loss; since it is batched, it m
 SAV: how should the range of k(xi,xj) be interpreted? how to decide for value threshold for returning -1 or +1?
    I guess the best thing to do is to learn a simple threshold, one feed forward 1-to-1

+plot the kernel matrix as an imshow, with rows/cols arranged by authors, and check whether the KTA that SCL yields
+    is better than that obtained using a traditional training for attribution.

--- a/experiments.sh
+++ b/experiments.sh
@ -1,14 +0,0 @@
-#!/bin/bash
-conda activate torch
-
-dataset=enron
-for authors in 10 50 ;  do
-  for alpha in 1 0.999 0.99 0.9 0.5 ; do
-    python main.py --dataset $dataset -A $authors -s 0 -o ../results_$dataset.csv --alpha $alpha
-  done
-done
-
-dataset=imdb62
-for alpha in 1 0.999 0.99 0.9 0.5 ; do
-  python main.py --dataset $dataset -A -1 -s 0 -o ../results_$dataset.csv --alpha $alpha
-done
--- a/src/data/AuthorshipDataset.py
+++ b/src/data/AuthorshipDataset.py
@ -9,17 +9,13 @@ import pickle
 class LabelledCorpus:

    def __init__(self, documents, labels):
-        if not isinstance(documents, np.ndarray): documents = np.asarray(documents, dtype=str)
-        if not isinstance(labels, np.ndarray): labels = np.asarray(labels)
+        if not isinstance(documents, np.ndarray):
+            documents = np.asarray(documents, dtype=object)  #dtype=str occupies too much in memory and is not needed
+        if not isinstance(labels, np.ndarray):
+            labels = np.asarray(labels)
        self.data = documents
        self.target = labels

-    def _tolist(self):
-        self.data = self.data.tolist()
-
-    def _toarray(self):
-        self.data = np.asarray(self.data, dtype=str)
-
    def __len__(self):
        return len(self.data)

@ -41,17 +37,11 @@ class AuthorshipDataset(ABC):
        if pickle_path and os.path.exists(pickle_path):
            print(f'loading dataset image in {pickle_path}')
            dataset = pickle.load(open(pickle_path, 'rb'))
-            dataset.train._toarray()
-            dataset.test._toarray()
        else:
            dataset = loader(**kwargs)
            if pickle_path:
                print(f'dumping dataset in {pickle_path} for faster load')
-                dataset.train._tolist()
-                dataset.test._tolist()
                pickle.dump(dataset, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
-                dataset.train._toarray()
-                dataset.test._toarray()
        return dataset

    def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
@ -62,13 +52,9 @@ class AuthorshipDataset(ABC):
        np.random.seed(random_state)

        self._check_n_authors(n_authors, n_open_set_authors)
-
        self.train, self.test, self.target_names = self._fetch_and_split()
-
        self._assure_docs_by_author(docs_by_author)
-
        self._reduce_authors_documents(n_authors, docs_by_author, n_open_set_authors)
-
        self._remove_label_gaps()

        super().__init__()
--- a/src/data/fetch_imdb62.py
+++ b/src/data/fetch_imdb62.py
@ -18,7 +18,7 @@ class Imdb62(AuthorshipDataset):
    def _fetch_and_split(self):
        file = open(self.data_path,'rt', encoding= "utf-8").readlines()
        splits = [line.split('\t') for line in file]
-        reviews = np.asarray([split[4]+' '+split[5] for split in splits])
+        reviews = [split[4]+' '+split[5] for split in splits]

        authors=[]
        authors_ids = dict()
--- a/src/data/fetch_victorian.py
+++ b/src/data/fetch_victorian.py
@ -19,7 +19,6 @@ class Victorian(AuthorshipDataset):
            csv_reader = csv.reader(file, delimiter = ',')
            next(csv_reader)
            for row in csv_reader:
-                # if row[0]!='text':
                data.append(row[0])
                labels.append(int(row[1]))

--- a/src/main.py
+++ b/src/main.py
@ -5,7 +5,7 @@ from data.fetch_blogs import Blogs
 from data.fetch_imdb62 import Imdb62
 from data.fetch_enron_mail import EnronMail
 from index import Index
-from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
+from model.classifiers import AuthorshipAttributionClassifier #, SameAuthorClassifier, FullAuthorClassifier
 from data.fetch_victorian import Victorian
 from evaluation import evaluation
 import torch
@ -16,11 +16,7 @@ import os
 import sys


-def main(opt):
-
-    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
-    print(f'running on {device}')
-
+def load_dataset(opt):
    # dataset load
    if opt.dataset == 'enron':
        loader = EnronMail
@ -39,13 +35,24 @@ def main(opt):
    pickle_path = None
    if opt.pickle:
        pickle_path = f'{opt.pickle}/{dataset_name}.pickle'
-    dataset = AuthorshipDataset.load(loader,
+    dataset = AuthorshipDataset.load(
+        loader,
        pickle_path=pickle_path,
        data_path=data_path,
        n_authors=opt.authors,
        docs_by_author=opt.documents,
        random_state=opt.seed
    )
+    return dataset_name, dataset
+
+
+
+def main(opt):
+
+    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+    print(f'running on {device}')
+
+    dataset_name, dataset = load_dataset(opt)

    # dataset indexing
    Xtr, ytr = dataset.train.data, dataset.train.target
@ -61,12 +68,6 @@ def main(opt):
    pad_index = index.add_word('PADTOKEN')
    print(f'vocabulary size={index.vocabulary_size()}')

-    #shuffle1 = np.random.permutation(Xte.shape[0])
-    #shuffle2 = np.random.permutation(Xte.shape[0])
-    #x1, y1 = Xte[shuffle1], yte[shuffle1]
-    #x2, y2 = Xte[shuffle2], yte[shuffle2]
-    #paired_y = y1==y2
-
    # attribution
    print('Attribution')
    phi = Phi(
@ -93,6 +94,13 @@ def main(opt):
    else:
        method = opt.name

+    cls.supervised_contrastive_learning(Xtr, ytr,
+            batch_size=opt.batchsize, epochs=opt.epochs, alpha=opt.alpha, lr=opt.lr,
+            log=f'{opt.log}/{method}-{dataset_name}.csv',
+            checkpointpath=opt.checkpoint)
+
+    sys.exit(0)
+
    # train
    val_microf1 = cls.fit(Xtr, ytr,
            batch_size=opt.batchsize, epochs=opt.epochs, alpha=opt.alpha, lr=opt.lr,
@ -154,7 +162,7 @@ if __name__ == '__main__':
    parser.add_argument('-e', '--epochs', help='Max number of epochs', type=int, default=250)
    parser.add_argument('-A', '--authors', help='Number of authors (-1 to select all)', type=int, default=-1)
    parser.add_argument('-D', '--documents', help='Number of documents per author (-1 to select all)', type=int, default=-1)
-    parser.add_argument('-s', '--seed', help='Random seed', type=int, default=-1)
+    parser.add_argument('-s', '--seed', help='Random seed', type=int, default=0)
    parser.add_argument('-o', '--output', help='File where to write test results', default='../results.csv')
    parser.add_argument('-l', '--log', help='Log dir where to output training an validation losses', default='../log')
    parser.add_argument('-P', '--pickle', help='If specified, pickles a copy of the dataset for faster reload. '
--- a/src/model/classifiers.py
+++ b/src/model/classifiers.py
@ -6,8 +6,11 @@ from sklearn.metrics import accuracy_score, f1_score
 from tqdm import tqdm
 import math
 from sklearn.model_selection import train_test_split
+
+from losses import SupConLoss1View
 from model.early_stop import EarlyStop
 from model.layers import FFProjection
+from torch.utils.data import DataLoader


 class AuthorshipAttributionClassifier(nn.Module):
@ -17,33 +20,35 @@ class AuthorshipAttributionClassifier(nn.Module):
        self.ff = FFProjection(input_size=projector.output_size,
                               hidden_sizes=[],
                               output_size=num_authors).to(device)
-        self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False, device=device)
+        self.pad_index = pad_index
+        self.pad_length = pad_length
        self.device = device

    def fit(self, X, y, batch_size, epochs, patience=10, lr=0.001, val_prop=0.1, alpha=1., log='../log/tmp.csv', checkpointpath='../checkpoint/model.dat'):
        assert 0 <= alpha <= 1, 'wrong range, alpha must be in [0,1]'
        early_stop = EarlyStop(patience)
-        batcher = Batch(batch_size=batch_size, n_epochs=epochs)
+
        #batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=X.shape[0]//batch_size)
-        batcher_val = Batch(batch_size=batch_size, n_epochs=epochs, shuffle=False)
        criterion = torch.nn.CrossEntropyLoss().to(self.device)
        savcriterion = torch.nn.BCEWithLogitsLoss().to(self.device)
        optim = torch.optim.Adam(self.parameters(), lr=lr)

        X, Xval, y, yval = train_test_split(X, y, test_size=val_prop, stratify=y)

+        tr_data = IndexedDataset(X, y, self.pad_length, self.pad_index, self.device)
+        val_data = IndexedDataset(Xval, yval, self.pad_length, self.pad_index, self.device)
+
        with open(log, 'wt') as foo:
            print()
            foo.write('epoch\ttr-loss\tval-loss\tval-acc\tval-Mf1\tval-mf1\n')
            tr_loss, val_loss = -1, -1
-            pbar = tqdm(range(1, batcher.n_epochs+1))
+            pbar = tqdm(range(1, epochs + 1))
            for epoch in pbar:
                # training
                self.train()
                losses, attr_losses, sav_losses = [], [], []
-                for xi, yi in batcher.epoch(X, y):
+                for xi, yi in tr_data.asDataLoader(batch_size, shuffle=True):
                    optim.zero_grad()
-                    xi = self.padder.transform(xi)
                    phi = self.projector(xi)

                    loss_attr = loss_sav = 0
@ -93,9 +98,11 @@ class AuthorshipAttributionClassifier(nn.Module):

                # validation
                self.eval()
+                with torch.no_grad:
                    predictions, losses = [], []
-                for xi, yi in batcher_val.epoch(Xval, yval):
-                    xi = self.padder.transform(xi)
+                    # for xi, yi in batcher_val.epoch(Xval, yval):
+                    for xi, yi in val_data.asDataLoader(batch_size, shuffle=False):
+                        # xi = self.padder.transform(xi)
                        logits = self.forward(xi)
                        loss = criterion(logits, torch.as_tensor(yi).to(self.device))
                        losses.append(loss.item())
@ -120,12 +127,78 @@ class AuthorshipAttributionClassifier(nn.Module):
        self.load_state_dict(torch.load(checkpointpath))
        return early_stop.best_score

+    def supervised_contrastive_learning(self, X, y, batch_size, epochs, patience=10, lr=0.001, val_prop=0.1, alpha=1., log='../log/tmp.csv', checkpointpath='../checkpoint/model.dat'):
+        assert 0 <= alpha <= 1, 'wrong range, alpha must be in [0,1]'
+        early_stop = EarlyStop(patience)
+
+        criterion = SupConLoss1View().to(self.device)
+        optim = torch.optim.Adam(self.parameters(), lr=lr)
+
+        X, Xval, y, yval = train_test_split(X, y, test_size=val_prop, stratify=y)
+
+        tr_data = IndexedDataset(X, y, self.pad_length, self.pad_index, self.device)
+        val_data = IndexedDataset(Xval, yval, self.pad_length, self.pad_index, self.device)
+
+        with open(log, 'wt') as foo:
+            print()
+            foo.write('epoch\ttr-loss\tval-loss\tval-acc\tval-Mf1\tval-mf1\n')
+            tr_loss, val_loss = -1, -1
+            pbar = tqdm(range(1, epochs + 1))
+            for epoch in pbar:
+                # training
+                self.train()
+                losses = []
+                for xi, yi in tr_data.asDataLoader(batch_size, shuffle=True):
+                    optim.zero_grad()
+                    phi = self.projector(xi)
+                    contrastive_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
+                    contrastive_loss.backward()
+                    optim.step()
+                    losses.append(contrastive_loss.item())
+                    tr_loss = np.mean(losses)
+                    pbar.set_description(f'training epoch={epoch} '
+                                         f'loss={tr_loss:.5f} '
+                                         f'val_loss={val_loss:.5f} '
+                                         f'patience={early_stop.patience}/{early_stop.patience_limit}')
+
+                # validation
+                # self.eval()
+                # with torch.no_grad:
+                #     predictions, losses = [], []
+                #     for xi, yi in val_data.asDataLoader(batch_size, shuffle=False):
+                #         phi = self.projector(xi)
+                #         contrastive_loss = criterion(phi, torch.as_tensor(yi).to(self.device))
+                #
+                #         logits = self.forward(xi)
+                #         loss = criterion(logits, torch.as_tensor(yi).to(self.device))
+                #         losses.append(loss.item())
+                #         logits = nn.functional.log_softmax(logits, dim=1)
+                #         prediction = tensor2numpy(torch.argmax(logits, dim=1).view(-1))
+                #         predictions.append(prediction)
+                #     val_loss = np.mean(losses)
+                #     predictions = np.concatenate(predictions)
+                #     acc = accuracy_score(yval, predictions)
+                #     macrof1 = f1_score(yval, predictions, average='macro')
+                #     microf1 = f1_score(yval, predictions, average='micro')
+                #
+                #     foo.write(f'{epoch}\t{tr_loss:.8f}\t{val_loss:.8f}\t{acc:.3f}\t{macrof1:.3f}\t{microf1:.3f}\n')
+                #     foo.flush()
+
+                # early_stop(microf1, epoch)
+                # if early_stop.IMPROVED:
+                #     torch.save(self.state_dict(), checkpointpath)
+                # elif early_stop.STOP:
+                #     break
+        print(f'training ended; loading best model parameters in {checkpointpath} for epoch {early_stop.best_epoch}')
+        self.load_state_dict(torch.load(checkpointpath))
+        return early_stop.best_score
+
    def predict(self, x, batch_size=100):
        self.eval()
-        batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
+        te_data = IndexedDataset(x, None, self.pad_length, self.pad_index, self.device)
        predictions = []
-        for xi in tqdm(batcher.epoch(x), desc='test'):
-            xi = self.padder.transform(xi)
+        with torch.no_grad:
+            for xi, yi in te_data.asDataLoader(batch_size, shuffle=False):
                logits = self.forward(xi)
                logits = nn.functional.log_softmax(logits, dim=1)
                prediction = tensor2numpy(torch.argmax(logits, dim=1).view(-1))
@ -168,134 +241,133 @@ def choose_sav_pairs(y, npairs):



-class SameAuthorClassifier(nn.Module):
-    def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'):
-        super(SameAuthorClassifier, self).__init__()
-        self.projector = projector.to(device)
-        self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False, device=device)
-        self.device = device
-
-    def fit(self, X, y, batch_size, epochs, lr=0.001, steps_per_epoch=100):
-        self.train()
-        batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=steps_per_epoch)
-        optim = torch.optim.Adam(self.parameters(), lr=lr)
-
-        pbar = tqdm(range(batcher.n_epochs))
-        for epoch in pbar:
-            losses = []
-            for xi, yi in batcher.epoch(X, y):
-                optim.zero_grad()
-                xi = self.padder.transform(xi)
-                phi = self.projector(xi)
-                #normalize phi to have norm 1? maybe better as the last step of projector
-                kernel = torch.matmul(phi, phi.T)
-                ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
-                loss = KernelAlignmentLoss(kernel, ideal_kernel)
-                loss.backward()
-                #clip_gradient(model)
-                optim.step()
-                losses.append(loss.item())
-                pbar.set_description(f'training epoch={epoch} loss={np.mean(losses):.5f}')
-
-    def predict(self, x, z, batch_size=100):
-        self.eval()
-        batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
-        predictions = []
-        for xi, zi in tqdm(batcher.epoch(x, z), desc='test'):
-            xi = self.padder.transform(xi)
-            zi = self.padder.transform(zi)
-            inners = self.forward(xi, zi)
-            prediction = tensor2numpy(inners) > 0.5 # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}?
-            predictions.append(prediction)
-        return np.concatenate(predictions)
-
-    def forward(self, x, z):
-        assert x.shape == z.shape, 'shape mismatch between matrices x and z'
-        phi_x = self.projector(x)
-        phi_z = self.projector(z)
-        rows, cols = phi_x.shape
-        pairwise_inners = torch.bmm(phi_x.view(rows, 1, cols), phi_z.view(rows, cols, 1)).squeeze()
-        return pairwise_inners
+# class SameAuthorClassifier(nn.Module):
+#     def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'):
+#         super(SameAuthorClassifier, self).__init__()
+#         self.projector = projector.to(device)
+#         self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False, device=device)
+#         self.device = device
+#
+#     def fit(self, X, y, batch_size, epochs, lr=0.001, steps_per_epoch=100):
+#         self.train()
+#         batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=steps_per_epoch)
+#         optim = torch.optim.Adam(self.parameters(), lr=lr)
+#
+#         pbar = tqdm(range(batcher.n_epochs))
+#         for epoch in pbar:
+#             losses = []
+#             for xi, yi in batcher.epoch(X, y):
+#                 optim.zero_grad()
+#                 xi = self.padder.transform(xi)
+#                 phi = self.projector(xi)
+#                 #normalize phi to have norm 1? maybe better as the last step of projector
+#                 kernel = torch.matmul(phi, phi.T)
+#                 ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
+#                 loss = KernelAlignmentLoss(kernel, ideal_kernel)
+#                 loss.backward()
+#                 #clip_gradient(model)
+#                 optim.step()
+#                 losses.append(loss.item())
+#                 pbar.set_description(f'training epoch={epoch} loss={np.mean(losses):.5f}')
+#
+#     def predict(self, x, z, batch_size=100):
+#         self.eval()
+#         batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
+#         predictions = []
+#         for xi, zi in tqdm(batcher.epoch(x, z), desc='test'):
+#             xi = self.padder.transform(xi)
+#             zi = self.padder.transform(zi)
+#             inners = self.forward(xi, zi)
+#             prediction = tensor2numpy(inners) > 0.5 # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}?
+#             predictions.append(prediction)
+#         return np.concatenate(predictions)
+#
+#     def forward(self, x, z):
+#         assert x.shape == z.shape, 'shape mismatch between matrices x and z'
+#         phi_x = self.projector(x)
+#         phi_z = self.projector(z)
+#         rows, cols = phi_x.shape
+#         pairwise_inners = torch.bmm(phi_x.view(rows, 1, cols), phi_z.view(rows, cols, 1)).squeeze()
+#         return pairwise_inners


-class FullAuthorClassifier(nn.Module):
-    def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'):
-        super(FullAuthorClassifier, self).__init__()
-        self.projector = projector.to(device)
-        self.ff = FFProjection(input_size=projector.space_dimensions(),
-                               hidden_sizes=[1024],
-                               output_size=num_authors).to(device)
-        self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False, device=device)
-        self.device = device
-
-    def fit(self, X, y, batch_size, epochs, lr=0.001, steps_per_epoch=100):
-        self.train()
-        batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=steps_per_epoch)
-        criterion = torch.nn.CrossEntropyLoss().to(self.device)
-        optim = torch.optim.Adam(self.parameters(), lr=lr)
-        alpha = 0.5
-
-        pbar = tqdm(range(batcher.n_epochs))
-        for epoch in pbar:
-            losses, sav_losses, attr_losses = [], [], []
-            for xi, yi in batcher.epoch(X, y):
-                optim.zero_grad()
-                xi = self.padder.transform(xi)
-                phi = self.projector(xi)
-                #normalize phi to have norm 1? maybe better as the last step of projector
-
-                #sav-loss
-                kernel = torch.matmul(phi, phi.T)
-                ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
-                sav_loss = KernelAlignmentLoss(kernel, ideal_kernel)
-                sav_losses.append(sav_loss.item())
-
-                #attr-loss
-                logits = self.ff(phi)
-                attr_loss = criterion(logits, torch.as_tensor(yi).to(self.device))
-                attr_losses.append(attr_loss.item())
-
-                #loss
-                loss = (alpha)*sav_loss + (1-alpha)*attr_loss
-                losses.append(loss.item())
-
-                loss.backward()
-                #clip_gradient(model)
-                optim.step()
-                pbar.set_description(
-                    f'training epoch={epoch} '
-                    f'sav-loss={np.mean(sav_losses):.5f} '
-                    f'attr-loss={np.mean(attr_losses):.5f} '
-                    f'loss={np.mean(losses):.5f}'
-                )
-
-    def predict_sav(self, x, z, batch_size=100):
-        self.eval()
-        batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
-        predictions = []
-        for xi, zi in tqdm(batcher.epoch(x, z), desc='test'):
-            xi = self.padder.transform(xi)
-            zi = self.padder.transform(zi)
-            phi_xi = self.projector(xi)
-            phi_zi = self.projector(zi)
-            rows, cols = phi_xi.shape
-            pairwise_inners = torch.bmm(phi_xi.view(rows, 1, cols), phi_zi.view(rows, cols, 1)).squeeze()
-            prediction = tensor2numpy(pairwise_inners) > 0.5 # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}?
-            predictions.append(prediction)
-        return np.concatenate(predictions)
-
-    def predict_labels(self, x, batch_size=100):
-        self.eval()
-        batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
-        predictions = []
-        for xi in tqdm(batcher.epoch(x), desc='test'):
-            xi = self.padder.transform(xi)
-            phi = self.projector(xi)
-            logits = self.ff(phi)
-            prediction = tensor2numpy( torch.argmax(logits, dim=1).view(-1))
-            predictions.append(prediction)
-        return np.concatenate(predictions)
-
+# class FullAuthorClassifier(nn.Module):
+#     def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'):
+#         super(FullAuthorClassifier, self).__init__()
+#         self.projector = projector.to(device)
+#         self.ff = FFProjection(input_size=projector.space_dimensions(),
+#                                hidden_sizes=[1024],
+#                                output_size=num_authors).to(device)
+#         self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False, device=device)
+#         self.device = device
+#
+#     def fit(self, X, y, batch_size, epochs, lr=0.001, steps_per_epoch=100):
+#         self.train()
+#         batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=steps_per_epoch)
+#         criterion = torch.nn.CrossEntropyLoss().to(self.device)
+#         optim = torch.optim.Adam(self.parameters(), lr=lr)
+#         alpha = 0.5
+#
+#         pbar = tqdm(range(batcher.n_epochs))
+#         for epoch in pbar:
+#             losses, sav_losses, attr_losses = [], [], []
+#             for xi, yi in batcher.epoch(X, y):
+#                 optim.zero_grad()
+#                 xi = self.padder.transform(xi)
+#                 phi = self.projector(xi)
+#                 #normalize phi to have norm 1? maybe better as the last step of projector
+#
+#                 #sav-loss
+#                 kernel = torch.matmul(phi, phi.T)
+#                 ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
+#                 sav_loss = KernelAlignmentLoss(kernel, ideal_kernel)
+#                 sav_losses.append(sav_loss.item())
+#
+#                 #attr-loss
+#                 logits = self.ff(phi)
+#                 attr_loss = criterion(logits, torch.as_tensor(yi).to(self.device))
+#                 attr_losses.append(attr_loss.item())
+#
+#                 #loss
+#                 loss = (alpha)*sav_loss + (1-alpha)*attr_loss
+#                 losses.append(loss.item())
+#
+#                 loss.backward()
+#                 #clip_gradient(model)
+#                 optim.step()
+#                 pbar.set_description(
+#                     f'training epoch={epoch} '
+#                     f'sav-loss={np.mean(sav_losses):.5f} '
+#                     f'attr-loss={np.mean(attr_losses):.5f} '
+#                     f'loss={np.mean(losses):.5f}'
+#                 )
+#
+#     def predict_sav(self, x, z, batch_size=100):
+#         self.eval()
+#         batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
+#         predictions = []
+#         for xi, zi in tqdm(batcher.epoch(x, z), desc='test'):
+#             xi = self.padder.transform(xi)
+#             zi = self.padder.transform(zi)
+#             phi_xi = self.projector(xi)
+#             phi_zi = self.projector(zi)
+#             rows, cols = phi_xi.shape
+#             pairwise_inners = torch.bmm(phi_xi.view(rows, 1, cols), phi_zi.view(rows, cols, 1)).squeeze()
+#             prediction = tensor2numpy(pairwise_inners) > 0.5 # is this correct? should it be > 0 and the ideal kernel in field {-1,+1}?
+#             predictions.append(prediction)
+#         return np.concatenate(predictions)
+#
+#     def predict_labels(self, x, batch_size=100):
+#         self.eval()
+#         batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
+#         predictions = []
+#         for xi in tqdm(batcher.epoch(x), desc='test'):
+#             xi = self.padder.transform(xi)
+#             phi = self.projector(xi)
+#             logits = self.ff(phi)
+#             prediction = tensor2numpy( torch.argmax(logits, dim=1).view(-1))
+#             predictions.append(prediction)
+#         return np.concatenate(predictions)

 #def KernelAlignmentLoss(K, Y):
 #    n_el = K.shape[0]*K.shape[1]
@ -304,92 +376,89 @@ class FullAuthorClassifier(nn.Module):
 #    return loss


-
-class Batch:
-    def __init__(self, batch_size, n_epochs=1, shuffle=True):
-        self.batch_size = batch_size
-        self.n_epochs = n_epochs
-        self.shuffle = shuffle
-        self.current_epoch = 0
-
-    def epoch(self, *args):
-        lengths = list(map(len, args))
-        assert max(lengths) == min(lengths), 'inconsistent sizes in args'
-        n_batches = math.ceil(lengths[0] / self.batch_size)
-        offset = 0
-        if self.shuffle:
-            index = np.random.permutation(len(args[0]))
-            args = [arg[index] for arg in args]
-        for b in range(n_batches):
-            batch_idx = slice(offset, offset+self.batch_size)
-            batch = [arg[batch_idx] for arg in args]
-            yield batch if len(batch) > 1 else batch[0]
-            offset += self.batch_size
-        self.current_epoch += 1
-
-
-class TwoClassBatch:
-    """
-    given a X and y (multi-label) produces batches of elements of X, y for two classes (e.g., c1, c2)
-    of equal size, i.e., the batch is [(x1,c1), ..., (xn,c1), (xn+1,c2), ..., (x2n,c2)]
-    """
-    def __init__(self, batch_size, n_epochs, steps_per_epoch):
-        self.batch_size = batch_size
-        self.n_epochs = n_epochs
-        self.steps_per_epoch = steps_per_epoch
-        self.current_epoch = 0
-        if self.batch_size % 2 != 0:
-            raise ValueError('warning, batch size is not even')
-
-    def epoch(self, X, y):
-        n_el = len(y)
-        assert X.shape[0] == n_el, 'inconsistent sizes in X, y'
-        classes = np.unique(y)
-        groups = {ci: X[y==ci] for ci in classes}
-        class_prevalences = [len(groups[ci])/n_el for ci in classes]
-        n_choices = self.batch_size // 2
-
-        for b in range(self.steps_per_epoch):
-            class1, class2 = np.random.choice(classes, p=class_prevalences, size=2, replace=False)
-            X1 = np.random.choice(groups[class1], size=n_choices)
-            X2 = np.random.choice(groups[class2], size=n_choices)
-            X_batch = np.concatenate([X1,X2])
-            y_batch = np.repeat([class1, class2], repeats=[n_choices,n_choices])
-            yield X_batch, y_batch
-        self.current_epoch += 1
-
-
-class Padding:
-    def __init__(self, pad_index, max_length, dynamic=True, pad_at_end=True, device='cpu'):
-        """
-        :param pad_index: the index representing the PAD token
-        :param max_length: the length that defines the padding
-        :param dynamic: if True (default) pads at min(max_length, max_local_length) where max_local_length is the
-        length of the longest example
-        :param pad_at_end: if True, the pad tokens are added at the end of the lists, if otherwise they are added
-        at the beginning
-        """
-        self.pad = pad_index
-        self.max_length = max_length
-        self.dynamic = dynamic
-        self.pad_at_end = pad_at_end
-        self.device = device
-
-    def transform(self, X):
-        """
-        :param X: a list of lists of indexes (integers)
-        :return: a ndarray of shape (n,m) where n is the number of elements in X and m is the pad length (the maximum
-        in elements of X if dynamic, or self.max_length if otherwise)
-        """
-        X = [x[:self.max_length] for x in X]
-        lengths = list(map(len, X))
-        pad_length = min(max(lengths), self.max_length) if self.dynamic else self.max_length
-        if self.pad_at_end:
-            padded = [x + [self.pad] * (pad_length - x_len) for x, x_len in zip(X, lengths)]
-        else:
-            padded = [[self.pad] * (pad_length - x_len) + x for x, x_len in zip(X, lengths)]
-        return torch.from_numpy(np.asarray(padded, dtype=int)).to(self.device)
+# class TwoClassBatch:
+#     """
+#     given a X and y (multi-label) produces batches of elements of X, y for two classes (e.g., c1, c2)
+#     of equal size, i.e., the batch is [(x1,c1), ..., (xn,c1), (xn+1,c2), ..., (x2n,c2)]
+#     """
+#     def __init__(self, batch_size, n_epochs, steps_per_epoch):
+#         self.batch_size = batch_size
+#         self.n_epochs = n_epochs
+#         self.steps_per_epoch = steps_per_epoch
+#         self.current_epoch = 0
+#         if self.batch_size % 2 != 0:
+#             raise ValueError('warning, batch size is not even')
+#
+#     def epoch(self, X, y):
+#         n_el = len(y)
+#         assert X.shape[0] == n_el, 'inconsistent sizes in X, y'
+#         classes = np.unique(y)
+#         groups = {ci: X[y==ci] for ci in classes}
+#         class_prevalences = [len(groups[ci])/n_el for ci in classes]
+#         n_choices = self.batch_size // 2
+#
+#         for b in range(self.steps_per_epoch):
+#             class1, class2 = np.random.choice(classes, p=class_prevalences, size=2, replace=False)
+#             X1 = np.random.choice(groups[class1], size=n_choices)
+#             X2 = np.random.choice(groups[class2], size=n_choices)
+#             X_batch = np.concatenate([X1,X2])
+#             y_batch = np.repeat([class1, class2], repeats=[n_choices,n_choices])
+#             yield X_batch, y_batch
+#         self.current_epoch += 1


 def tensor2numpy(t):
    """
    :param t: a torch.tensor, on any device and possibly attached to the autograd graph
    :return: the content of t as a numpy ndarray
    """
    detached = t.detach()
    return detached.cpu().numpy()
+
+
+# ------------
+
+class IndexedDataset(torch.utils.data.Dataset):
+    """
+    Map-style dataset over documents encoded as sequences of indexes (integers), optionally paired with labels.
+    Documents are kept at their original length; padding and truncation happen per batch in `collate_pad_fn`.
+    """
+
+    def __init__(self, X, y, MAX_LENGTH, padindex, device, pad_at_end=False):
+        """
+        :param X: the documents, each one a sequence of indexes (integers)
+        :param y: the labels aligned with X, or None if the dataset is unlabelled
+        :param MAX_LENGTH: documents longer than this are truncated when a batch is collated
+        :param padindex: the index representing the PAD token
+        :param device: the device the collated tensors are moved to
+        :param pad_at_end: if True, the pad tokens are added at the end of the documents, if otherwise (default)
+        they are added at the beginning
+        """
+        self.X = X
+        self.y = y
+        self.MAX_LENGTH = MAX_LENGTH
+        self.padindex = padindex
+        self.device = device
+        self.pad_at_end = pad_at_end
+
+    def __len__(self):
+        return len(self.X)
+
+    @property
+    def islabelled(self):
+        """
+        :return: True if labels were provided (y is not None), False otherwise
+        """
+        return self.y is not None
+
+    def __getitem__(self, index):
+        """
+        :param index: position of the requested element
+        :return: the pair (document, label) if the dataset is labelled, or the document alone if otherwise
+        """
+        if self.islabelled:
+            return self.X[index], self.y[index]
+        else:
+            return self.X[index]
+
+    def collate_pad_fn(self, batch):
+        """
+        :param batch: a list of items as returned by __getitem__, i.e., (document, label) pairs if the dataset is
+        labelled, or documents alone if otherwise
+        :return: a torch.tensor of shape (n,m) where n is the number of elements in the batch and m is the pad length
+        (the length of the longest document in the batch, capped at MAX_LENGTH); if the dataset is labelled, a tuple
+        with this tensor and the tensor containing the n labels
+        """
+        if self.islabelled:
+            X, y = list(zip(*batch))
+        else:
+            X = batch
+        pad_length = min(max(map(len, X)), self.MAX_LENGTH)
+        # truncate first and measure afterwards, so that the number of pads is never negative; each document is
+        # copied into a list so that "+" concatenates (on a numpy array it would add element-wise instead)
+        X = [list(x[:pad_length]) for x in X]
+        pads = [[self.padindex] * (pad_length - len(x)) for x in X]
+        if self.pad_at_end:
+            padded = [x + pad for x, pad in zip(X, pads)]
+        else:
+            padded = [pad + x for x, pad in zip(X, pads)]
+
+        # int64 is requested explicitly so that the dtype of the index tensor does not depend on the platform
+        X = torch.from_numpy(np.asarray(padded, dtype=np.int64)).to(self.device)
+        if self.islabelled:
+            y = torch.from_numpy(np.asarray(y)).to(self.device)
+            return X, y
+        else:
+            return X
+
+    def asDataLoader(self, batch_size, shuffle):
+        """
+        :param batch_size: number of elements per batch
+        :param shuffle: whether the DataLoader reshuffles the elements at every epoch
+        :return: a torch DataLoader over this dataset that collates the batches with `collate_pad_fn`
+        """
+        return torch.utils.data.DataLoader(self, batch_size=batch_size, shuffle=shuffle, collate_fn=self.collate_pad_fn)
--- a/src/model/layers.py
+++ b/src/model/layers.py
@ -70,36 +70,36 @@ class FFProjection(nn.Module):


 # deprecated
-class RNNProjection(nn.Module):
-    def __init__(self, vocab_size, hidden_size, output_size, device='cpu'):
-        super(RNNProjection, self).__init__()
-        self.output_size = output_size
-        self.hidden_size = hidden_size
-        self.vocab_size = vocab_size
-        self.num_layers=1
-        self.num_directions=1
-        self.device = device
-
-        self.embedding = nn.Embedding(vocab_size, hidden_size).to(device)
-        self.rnn = nn.GRU(
-            input_size=hidden_size,
-            hidden_size=hidden_size,
-            num_layers=self.num_layers,
-            bidirectional=(self.num_directions == 2),
-            batch_first=True
-        ).to(device)
-        self.projection = nn.Linear(self.num_layers * self.num_directions * self.hidden_size, output_size).to(device)
-
-    def init_hidden(self, batch_size):
-        return torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(self.device)
-
-    def forward(self, x):
-        batch_size = x.shape[0]
-        x = self.embedding(x)
-        output, hn = self.rnn(x, self.init_hidden(batch_size))
-        hn = hn.view(self.num_layers, self.num_directions, batch_size, self.hidden_size)
-        hn = hn.permute(2, 0, 1, 3).reshape(batch_size, -1)
-        return self.projection(hn)
-
-    def space_dimensions(self):
-        return self.output_size
+# class RNNProjection(nn.Module):
+#     def __init__(self, vocab_size, hidden_size, output_size, device='cpu'):
+#         super(RNNProjection, self).__init__()
+#         self.output_size = output_size
+#         self.hidden_size = hidden_size
+#         self.vocab_size = vocab_size
+#         self.num_layers=1
+#         self.num_directions=1
+#         self.device = device
+#
+#         self.embedding = nn.Embedding(vocab_size, hidden_size).to(device)
+#         self.rnn = nn.GRU(
+#             input_size=hidden_size,
+#             hidden_size=hidden_size,
+#             num_layers=self.num_layers,
+#             bidirectional=(self.num_directions == 2),
+#             batch_first=True
+#         ).to(device)
+#         self.projection = nn.Linear(self.num_layers * self.num_directions * self.hidden_size, output_size).to(device)
+#
+#     def init_hidden(self, batch_size):
+#         return torch.zeros(self.num_layers * self.num_directions, batch_size, self.hidden_size).to(self.device)
+#
+#     def forward(self, x):
+#         batch_size = x.shape[0]
+#         x = self.embedding(x)
+#         output, hn = self.rnn(x, self.init_hidden(batch_size))
+#         hn = hn.view(self.num_layers, self.num_directions, batch_size, self.hidden_size)
+#         hn = hn.permute(2, 0, 1, 3).reshape(batch_size, -1)
+#         return self.projection(hn)
+#
+#     def space_dimensions(self):
+#         return self.output_size