cleaning and refactoring

Alejandro Moreo Fernandez 2020-05-08 15:25:00 +02:00
parent 8ab808282a
commit b539c597d6
6 changed files with 249 additions and 92 deletions

View File

@@ -11,13 +11,13 @@ from collections import Counter
class EnronMail(AuthorshipDataset):
NUM_AUTHORS = 150
MAX_MAIL_LENGHT = 5000 # in words
MAX_MAIL_WORDS = 500
TEST_SIZE = 0.1
MIN_TOKENS = 10
def __init__(self, mail_dir='../data/enron_mail_20150507/maildir/*', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
self.mail_dir = mail_dir
super().__init__(mail_dir, n_authors, docs_by_author, n_open_set_authors, random_state)
def __init__(self, data_path='../data/enron_mail_20150507/maildir/*', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
self.data_path = data_path
super().__init__(data_path, n_authors, docs_by_author, n_open_set_authors, random_state)
def filter(self, base_str, filter_str):
if filter_str in base_str:
@@ -49,7 +49,7 @@ class EnronMail(AuthorshipDataset):
def _get_most_prolific_authors(self, n):
assert n <= EnronMail.NUM_AUTHORS, f'too many authors requested (maximum is {EnronMail.NUM_AUTHORS})'
author_paths = glob(self.mail_dir)
author_paths = glob(self.data_path)
if n == -1:
return author_paths
author_count = Counter(
@@ -90,9 +90,9 @@ def _fetch_emails_from_author(author_path, min_tokens):
body_tokens = body.split()
ntokens = len(body_tokens)
if ntokens >= min_tokens:
if ntokens > EnronMail.MAX_MAIL_LENGHT:
if ntokens > EnronMail.MAX_MAIL_WORDS:
trimmed += 1
body = ' '.join(body_tokens[:EnronMail.MAX_MAIL_LENGHT])
body = ' '.join(body_tokens[:EnronMail.MAX_MAIL_WORDS])
author_mails.append(body)
author_docs += 1
# if n_docs_by_author != -1 and author_docs >= n_docs_by_author:
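A minimal sketch of the length policy applied above: mails shorter than MIN_TOKENS are dropped and longer ones are trimmed to their first MAX_MAIL_WORDS tokens. The helper name _clip_body is hypothetical and not part of the commit; it only assumes the EnronMail constants shown in the diff.

def _clip_body(body, min_tokens=EnronMail.MIN_TOKENS, max_words=EnronMail.MAX_MAIL_WORDS):
    # drop very short mails; keep only the first max_words tokens of very long ones
    tokens = body.split()
    if len(tokens) < min_tokens:
        return None
    return ' '.join(tokens[:max_words])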

View File

@@ -1,6 +1,7 @@
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from util import parallelize
class Index:
@@ -9,7 +10,7 @@ class Index:
:param kwargs: keyword arguments passed to _sklearn.feature_extraction.text.CountVectorizer_
"""
self.vect = CountVectorizer(**kwargs)
self.unk = -1 # a valid index is assigned after fit
self.unk = -1 # a valid index is assigned after fit
def fit(self, X):
"""
@@ -22,11 +23,13 @@ class Index:
self.unk = self.add_word('UNKTOKEN')
return self
def transform(self, X):
def transform(self, X, n_jobs=-1):
assert self.unk > 0, 'transform called before fit'
return np.asarray([
[self.vocabulary.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(X, desc='indexing')]
)
indexed = parallelize(func=self.index, args=[doc for doc in X], n_jobs=n_jobs)
return np.asarray(indexed)
def index(self, documents):
return [[self.vocabulary.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
def fit_transform(self, X):
return self.fit(X).transform(X)
@@ -49,4 +52,3 @@ class Index:
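A sketch of how the indexer might be used after this change, with transform now delegating to util.parallelize; the toy corpus and parameter values are illustrative only.

index = Index(analyzer='char', ngram_range=(1, 1))
Xtr_idx = index.fit_transform(['first training text', 'second training text'])
Xte_idx = index.transform(['an unseen test text'], n_jobs=2)  # indexing split across parallel jobs
pad_index = index.add_word('PADTOKEN')                        # reserve an index for padding
print(index.vocabulary_size())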

View File

@@ -1,3 +1,4 @@
import argparse
import numpy as np
from data.AuthorshipDataset import AuthorshipDataset
from data.fetch_imdb62 import Imdb62
@@ -8,82 +9,160 @@ from data.fetch_victorian import Victorian
from evaluation import evaluation
import torch
from model.transformations import CNNProjection
from util import create_path_if_not_exists
import os
import sys
hidden_size=32
channels_out=128
output_size=1024
kernel_sizes=[6,7,8]
pad_length=3000
batch_size=50
n_epochs=256
bigrams=False
n_authors=50
docs_by_author=-1
seed=1
debug=False
if debug:
print(('*'*20)+' DEBUG MODE ' + ('*'*20))
hidden_size=16
output_size=32
pad_length=100
batch_size=10
n_epochs=20
n_authors = 50
docs_by_author = -1
def main(opt):
if torch.cuda.is_available():
device = torch.device('cuda')
else:
device = torch.device('cpu')
print(f'running on {device}')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'running on {device}')
dataset = AuthorshipDataset.load(EnronMail, pickle_path=f'../pickles/EnronMail{n_authors}_{seed}.pickle', mail_dir='../../authorship_analysis/data/enron_mail_20150507/maildir/*', n_authors=n_authors, docs_by_author=-1, random_state=seed)
#dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25)
#dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
Xtr, ytr = dataset.train.data, dataset.train.target
Xte, yte = dataset.test.data, dataset.test.target
A = np.unique(ytr)
print(f'num authors={len(A)}')
print(f'ntr = {len(Xtr)} nte = {len(Xte)}')
#sys.exit(0)
# dataset load
if opt.dataset == 'enron':
loader = EnronMail
data_path = '../../authorship_analysis/data/enron_mail_20150507/maildir/*'
elif opt.dataset == 'imdb62':
loader = Imdb62
data_path = '../../authorship_analysis/data/imdb62/imdb62.txt'
elif opt.dataset == 'victorian':
loader = Victorian
data_path='../../authorship_analysis/data/victoria'
index = Index(analyzer='char', ngram_range=(2,2) if bigrams else (1,1))
Xtr = index.fit_transform(Xtr)
Xte = index.transform(Xte)
pad_index = index.add_word('PADTOKEN')
print(f'vocabulary size={index.vocabulary_size()}')
dataset_name = f'{loader.__name__}_A{opt.authors}_D{opt.documents}_S{opt.seed}'
pickle_path = None
if opt.pickle:
pickle_path = f'{opt.pickle}/{dataset_name}.pickle'
dataset = AuthorshipDataset.load(loader,
pickle_path=pickle_path,
data_path=data_path,
n_authors=opt.authors,
docs_by_author=opt.documents,
random_state=opt.seed
)
#shuffle1 = np.random.permutation(Xte.shape[0])
#shuffle2 = np.random.permutation(Xte.shape[0])
#x1, y1 = Xte[shuffle1], yte[shuffle1]
#x2, y2 = Xte[shuffle2], yte[shuffle2]
#paired_y = y1==y2
# dataset indexing
Xtr, ytr = dataset.train.data, dataset.train.target
Xte, yte = dataset.test.data, dataset.test.target
A = np.unique(ytr)
print(f'num authors={len(A)}')
print(f'ntr = {len(Xtr)} nte = {len(Xte)}')
# attribution
print('Attribution')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
phi = CNNProjection(vocabulary_size=index.vocabulary_size(), embedding_dim=hidden_size, out_size=output_size, channels_out=channels_out, kernel_sizes=kernel_sizes, dropout=0.5).to(device)
cls = AuthorshipAttributionClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
yte_ = cls.predict(Xte)
evaluation(yte, yte_)
bigrams = False
index = Index(analyzer='char', ngram_range=(2,2) if bigrams else (1,1))
Xtr = index.fit_transform(Xtr)
Xte = index.transform(Xte)
pad_index = index.add_word('PADTOKEN')
print(f'vocabulary size={index.vocabulary_size()}')
# verification
#print('Verification')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
#cls = SameAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
#cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
#paired_y_ = cls.predict(x1,x2)
#eval(paired_y, paired_y_)
#shuffle1 = np.random.permutation(Xte.shape[0])
#shuffle2 = np.random.permutation(Xte.shape[0])
#x1, y1 = Xte[shuffle1], yte[shuffle1]
#x2, y2 = Xte[shuffle2], yte[shuffle2]
#paired_y = y1==y2
# attribution & verification
#print('Attribution & Verification')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
#cls = FullAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
#cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
#yte_ = cls.predict_labels(Xte)
#eval(yte, yte_)
#paired_y_ = cls.predict_sav(x1,x2)
#eval(paired_y, paired_y_)
# attribution
print('Attribution')
phi = CNNProjection(
vocabulary_size=index.vocabulary_size(),
embedding_dim=opt.hidden,
out_size=opt.repr,
channels_out=opt.chout,
kernel_sizes=opt.kernelsizes,
dropout=0.5
).to(device)
cls = AuthorshipAttributionClassifier(
phi, num_authors=A.size, pad_index=pad_index, pad_length=opt.pad, device=device
)
if opt.name == 'auto':
method = f'{phi.__class__.__name__}_alpha{opt.alpha}'
else:
method = opt.name
# train
cls.fit(Xtr, ytr,
batch_size=opt.batchsize, epochs=opt.epochs, alpha=opt.alpha, lr=opt.lr,
log=f'{opt.log}/{method}-{dataset_name}.csv',
checkpointpath=opt.checkpoint
)
# test
yte_ = cls.predict(Xte)
acc, macrof1, microf1 = evaluation(yte, yte_)
results = Results(opt.output)
results.add(dataset_name, method, acc, macrof1, microf1)
# verification
#print('Verification')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
#cls = SameAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
#cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
#paired_y_ = cls.predict(x1,x2)
#eval(paired_y, paired_y_)
# attribution & verification
#print('Attribution & Verification')
#phi = RNNProjection(vocab_size=index.vocabulary_size(), hidden_size=hidden_size, output_size=output_size, device=device)
#cls = FullAuthorClassifier(phi, num_authors=A.size, pad_index=pad_index, pad_length=pad_length, device=device)
#cls.fit(Xtr, ytr, batch_size=batch_size, epochs=n_epochs)
#yte_ = cls.predict_labels(Xte)
#eval(yte, yte_)
#paired_y_ = cls.predict_sav(x1,x2)
#eval(paired_y, paired_y_)
class Results:
def __init__(self, path):
addheader = not os.path.exists(path)
self.foo = open(path, 'at')
if addheader:
self.add('Dataset', 'Method', 'Accuracy', 'MacroF1', 'microF1')
def add(self, dataset, method, acc, macrof1, microf1):
self.foo.write(f'{dataset}\t{method}\t{acc}\t{macrof1}\t{microf1}\n')
self.foo.flush()
def close(self):
self.foo.close()
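The Results helper appends one tab-separated row per run and writes the header only when the file does not yet exist; a minimal usage sketch with illustrative values:

results = Results('../results.csv')
results.add('Imdb62_A-1_D-1_S1', 'CNNProjection_alpha1.0', 0.91, 0.89, 0.91)
results.close()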
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='CNN with KTA regularization')
parser.add_argument('-H', '--hidden', help='Hidden/embedding size', type=int, default=32)
parser.add_argument('-c', '--chout', help='Channels output size', type=int, default=128)
parser.add_argument('-r', '--repr', help='Projection size (phi)', type=int, default=1024)
parser.add_argument('-k', '--kernelsizes', help='Size of the convolutional kernels', nargs='+', default=[6,7,8])
parser.add_argument('-p', '--pad', help='Pad length', type=int, default=3000)
parser.add_argument('-b', '--batchsize', help='Batch size', type=int, default=50)
parser.add_argument('-e', '--epochs', help='Max number of epochs', type=int, default=250)
parser.add_argument('-A', '--authors', help='Number of authors (-1 to select all)', type=int, default=-1)
parser.add_argument('-D', '--documents', help='Number of documents per author (-1 to select all)', type=int, default=-1)
parser.add_argument('-s', '--seed', help='Random seed', type=int, default=-1)
parser.add_argument('-o', '--output', help='File where to write test results', default='../results.csv')
parser.add_argument('-l', '--log', help='Log dir where to output training and validation losses', default='../log')
parser.add_argument('-P', '--pickle', help='If specified, pickles a copy of the dataset for faster reload. '
'This parameter indicates a directory, the name of the pickle is '
'derived automatically.', default='../pickles')
parser.add_argument('-a', '--alpha', help='Trade-off controlling the loss as alpha*attr-loss + (1-alpha)*sav-loss', type=float, default=1.)
parser.add_argument('--lr', help='Learning rate', type=float, default=0.001)
parser.add_argument('--checkpoint', help='Path where to dump model parameters', default='../checkpoint/model.dat')
parser.add_argument('-n', '--name', help='Name of the model', default='auto')
requiredNamed = parser.add_argument_group('required named arguments')
requiredNamed.add_argument('-d', '--dataset', help='Name of the dataset', required=True, type=str)
opt = parser.parse_args()
assert opt.dataset in ['enron', 'imdb62', 'victorian'], 'unknown dataset'
create_path_if_not_exists(opt.output)
create_path_if_not_exists(opt.log)
create_path_if_not_exists(opt.checkpoint)
if opt.pickle is not None:
create_path_if_not_exists(opt.pickle)
main(opt)
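Assuming the script is saved as main.py (its filename is not shown in the diff) and that the hard-coded data paths exist, a typical invocation might look like the following; the only required argument is -d/--dataset, everything else falls back to the defaults declared above.

python main.py --dataset imdb62 -A 10 -D -1 -e 100 -b 50 --lr 0.001 -a 1.0 -s 1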

View File

@@ -6,6 +6,8 @@ from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from model.early_stop import EarlyStop
class AuthorshipAttributionClassifier(nn.Module):
def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'):
@@ -17,7 +19,9 @@ class AuthorshipAttributionClassifier(nn.Module):
self.padder = Padding(pad_index=pad_index, max_length=pad_length, dynamic=True, pad_at_end=False, device=device)
self.device = device
def fit(self, X, y, batch_size, epochs, lr=0.001, val_prop=0.1, log='../log/tmp.csv'):
def fit(self, X, y, batch_size, epochs, patience=20, lr=0.001, val_prop=0.1, alpha=1., log='../log/tmp.csv', checkpointpath='../checkpoint/model.dat'):
assert 0 <= alpha <= 1, 'wrong range, alpha must be in [0,1]'
early_stop = EarlyStop(patience)
batcher = Batch(batch_size=batch_size, n_epochs=epochs)
#batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=X.shape[0]//batch_size)
batcher_val = Batch(batch_size=batch_size, n_epochs=epochs, shuffle=False)
@@ -39,26 +43,35 @@ class AuthorshipAttributionClassifier(nn.Module):
xi = self.padder.transform(xi)
phi = self.projector(xi)
logits = self.ff(phi)
loss_attr = criterion(logits, torch.as_tensor(yi).to(self.device))
loss_attr = loss_sav = 0
loss_attr_value = loss_sav_value = -1
kernel = torch.matmul(phi, phi.T)
ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
loss_sav = KernelAlignmentLoss(kernel, ideal_kernel)
if alpha > 0:
logits = self.ff(phi)
loss_attr = criterion(logits, torch.as_tensor(yi).to(self.device))
loss_attr_value = loss_attr.item()
loss = loss_attr + 0*loss_sav
if alpha < 1:
kernel = torch.matmul(phi, phi.T)
ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
loss_sav = KernelAlignmentLoss(kernel, ideal_kernel)
loss_sav_value = loss_sav.item()
loss = loss_attr*alpha + loss_sav*(1.-alpha)
loss.backward()
optim.step()
attr_losses.append(loss_attr.item())
sav_losses.append(loss_sav.item())
attr_losses.append(loss_attr_value)
sav_losses.append(loss_sav_value)
losses.append(loss.item())
tr_loss = np.mean(losses)
pbar.set_description(f'training epoch={epoch} '
f'loss={tr_loss:.5f} '
f'attr-loss={np.mean(attr_losses):.5f} '
f'sav-loss={np.mean(sav_losses):.5f} val_loss={val_loss:.5f}')
f'sav-loss={np.mean(sav_losses):.5f} '
f'val_loss={val_loss:.5f}'
)
# validation
self.eval()
@@ -80,6 +93,14 @@ class AuthorshipAttributionClassifier(nn.Module):
foo.write(f'{epoch}\t{tr_loss:.8f}\t{val_loss:.8f}\t{acc:.3f}\t{macrof1:.3f}\t{microf1:.3f}\n')
foo.flush()
early_stop(microf1, epoch)
if early_stop.IMPROVED:
torch.save(self.state_dict(), checkpointpath)
elif early_stop.STOP:
break
print(f'training ended; loading best model parameters from {checkpointpath} (epoch {early_stop.best_epoch})')
self.load_state_dict(torch.load(checkpointpath))
def predict(self, x, batch_size=100):
self.eval()
batcher = Batch(batch_size=batch_size, n_epochs=1, shuffle=False)
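An aside on the SAV term used in fit() above: the expression np.outer(1 + yi, 1 / (yi + 1)) == 1 is a compact way to build the ideal kernel, since (1 + yi[i]) / (1 + yi[j]) equals 1 exactly when the two labels coincide. A small NumPy check, illustrative only (note it relies on n * (1/n) being exactly 1.0 in floating point, which holds for small label values):

import numpy as np

yi = np.array([0, 2, 0, 1])                          # toy author labels for one batch
ideal = 1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)    # 1 where yi[i] == yi[j], 0 elsewhere
same_author = (yi[:, None] == yi[None, :]).astype(int)
assert (ideal == same_author).all()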

src/model/early_stop.py (new file, 26 lines added)
View File

@@ -0,0 +1,26 @@
import torch
from time import time
class EarlyStop:
def __init__(self, patience=20):
self.patience_limit = patience
self.patience = patience
self.best_score = None
self.best_epoch = None
self.STOP = False
self.IMPROVED = False
def __call__(self, watch_score, epoch):
if self.best_score is None or watch_score >= self.best_score:
self.IMPROVED = True
self.best_score = watch_score
self.best_epoch = epoch
self.patience = self.patience_limit
else:
self.IMPROVED = False
self.patience -= 1
if self.patience <= 0:
self.STOP = True
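A runnable sketch of the intended usage pattern, mirroring how fit() consumes this class above; the validation scores are toy values, and in fit() the IMPROVED branch additionally saves a checkpoint with torch.save.

early_stop = EarlyStop(patience=3)
val_scores = [0.50, 0.62, 0.61, 0.60, 0.59, 0.58]    # one (higher-is-better) score per epoch
for epoch, score in enumerate(val_scores):
    early_stop(score, epoch)
    if early_stop.IMPROVED:
        pass  # fit() saves self.state_dict() at this point
    elif early_stop.STOP:
        break  # patience exhausted at epoch 4
print(early_stop.best_epoch, early_stop.best_score)  # -> 1 0.62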

src/util.py (new file, 29 lines added)
View File

@@ -0,0 +1,29 @@
from pathlib import Path
import os
from joblib import Parallel, delayed
import multiprocessing
import itertools
def create_path_if_not_exists(file):
parent_path = Path(file).parent
os.makedirs(parent_path, exist_ok=True)
# given the number of tasks and the number of jobs, generates the slices assigned to each parallel job
def get_parallel_slices(n_tasks, n_jobs=-1):
if n_jobs==-1:
n_jobs = multiprocessing.cpu_count()
batch = int(n_tasks / n_jobs)
remainder = n_tasks % n_jobs
return [slice(job*batch, (job+1)*batch + (remainder if job == n_jobs - 1 else 0)) for job in range(n_jobs)]
def parallelize(func, args, n_jobs):
slices = get_parallel_slices(len(args), n_jobs)
results = Parallel(n_jobs=n_jobs)(
delayed(func)(args[slice_i]) for slice_i in slices
)
return list(itertools.chain.from_iterable(results))
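A small illustration of the slicing and parallelization helpers, assuming the code is run from src/ so that util is importable; the token-counting function is just a toy workload.

from util import get_parallel_slices, parallelize

print(get_parallel_slices(10, n_jobs=3))
# -> [slice(0, 3, None), slice(3, 6, None), slice(6, 10, None)]  (the remainder goes to the last job)

def token_counts(docs):
    return [len(d.split()) for d in docs]

# each job receives a contiguous slice of args; the per-job results are flattened back in order
print(parallelize(func=token_counts, args=['one', 'two words', 'now three words'], n_jobs=2))
# -> [1, 2, 3]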