enron mail

Alejandro Moreo Fernandez 2020-05-07 14:03:47 +02:00
parent 0be3e5547e
commit 8ab808282a
4 changed files with 154 additions and 16 deletions

View File

@ -2,16 +2,24 @@ from abc import ABC, abstractmethod
import random
import numpy as np
from collections import Counter
import os
import pickle
class LabelledCorpus:
def __init__(self, documents, labels):
-if not isinstance(documents, np.ndarray): documents = np.asarray(documents)
+if not isinstance(documents, np.ndarray): documents = np.asarray(documents, dtype=str)
if not isinstance(labels, np.ndarray): labels = np.asarray(labels)
self.data = documents
self.target = labels
def _tolist(self):
self.data = self.data.tolist()
def _toarray(self):
self.data = np.asarray(self.data, dtype=str)
def __len__(self):
return len(self.data)
@ -27,8 +35,28 @@ class LabelledCorpus:
class AuthorshipDataset(ABC):
def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors = 0, random_state=42):
@classmethod
def load(cls, loader, pickle_path=None, **kwargs):
#assert isinstance(loader, AuthorshipDataset), 'unknown loader'
if pickle_path and os.path.exists(pickle_path):
print(f'loading pickled dataset from {pickle_path}')
dataset = pickle.load(open(pickle_path, 'rb'))
dataset.train._toarray()
dataset.test._toarray()
else:
dataset = loader(**kwargs)
if pickle_path:
print(f'dumping dataset to {pickle_path} for faster subsequent loads')
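# temporarily convert the numpy arrays to plain lists before pickling; they are restored to arrays right after the dump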
dataset.train._tolist()
dataset.test._tolist()
pickle.dump(dataset, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
dataset.train._toarray()
dataset.test._toarray()
return dataset
def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
self.data_path = data_path
self.n_authors = n_authors
random.seed(random_state)
np.random.seed(random_state)
@ -45,17 +73,14 @@ class AuthorshipDataset(ABC):
super().__init__()
@abstractmethod
def _fetch_and_split(self):
pass
@abstractmethod
def _check_n_authors(self, n_authors, n_open_set_authors):
pass
def _reduce_authors_documents(self, n_authors, n_docs_by_author, n_open_set_authors):
if n_authors != -1 or n_docs_by_author != -1:
@ -88,7 +113,6 @@ class AuthorshipDataset(ABC):
else:
self.test_out = None
# reindex labels so that the unique labels are equal to range(#num_different_authors)
# and unique training labels are range(#num_different_training_authors)
def _remove_label_gaps(self):
@ -131,11 +155,13 @@ class AuthorshipDataset(ABC):
return
author_doc_count = Counter(self.train.target)
-to_remove = frozenset([id for id,count in author_doc_count.most_common() if count<docs_by_author])
+to_remove = frozenset([id for id, count in author_doc_count.most_common() if count < docs_by_author])
assert len(to_remove) < len(author_doc_count), 'impossible selection'
-if len(to_remove)>0:
+if len(to_remove) > 0:
self.train = LabelledCorpus.filter(self.train, to_remove)
self.test = LabelledCorpus.filter(self.test, to_remove)
self.target_names = sorted(set(self.target_names) - to_remove)

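The reindexing described by the comment on _remove_label_gaps above is implemented outside the hunks shown here; as a sketch only, one way to remap labels so that training authors occupy range(#num_different_training_authors) and any test-only (open-set) authors take the ids that follow, with purely illustrative label values:

import numpy as np

# illustrative author ids only
train_labels = np.asarray([7, 7, 42, 42, 42, 13])
test_labels = np.asarray([13, 42, 99])  # 99 appears only in test (open set)

# training authors first, so they end up in range(#training_authors);
# any remaining test-only author gets the next free id
remap = {old: new for new, old in enumerate(np.unique(train_labels))}
for old in np.unique(test_labels):
    remap.setdefault(old, len(remap))

train_labels = np.asarray([remap[l] for l in train_labels])
test_labels = np.asarray([remap[l] for l in test_labels])
print(train_labels, test_labels)  # [0 0 2 2 2 1] [1 2 3]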
View File

@ -0,0 +1,107 @@
import eml_parser
from glob import glob
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from data.AuthorshipDataset import AuthorshipDataset, LabelledCorpus
import numpy as np
from joblib import Parallel, delayed
from collections import Counter
class EnronMail(AuthorshipDataset):
NUM_AUTHORS = 150
MAX_MAIL_LENGTH = 5000  # maximum mail length, in words
TEST_SIZE = 0.1
MIN_TOKENS = 10
def __init__(self, mail_dir='../data/enron_mail_20150507/maildir/*', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
self.mail_dir = mail_dir
super().__init__(mail_dir, n_authors, docs_by_author, n_open_set_authors, random_state)
def filter(self, base_str, filter_str):
if filter_str in base_str:
idx = base_str.index(filter_str)
base_str = base_str[:idx]
return base_str
def _fetch_and_split(self):
labels = []
data = []
path_list = self._get_most_prolific_authors(self.n_authors)
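# parse each selected author's mail folder in parallel, one job per author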
emails_authors = Parallel(n_jobs=-1)(
delayed(_fetch_emails_from_author)(author_path, EnronMail.MIN_TOKENS) for author_path in path_list
)
for emails, author in emails_authors:
data.extend(emails)
labels.extend([author]*len(emails))
target_names = sorted(np.unique(labels))
train_data, test_data, train_labels, test_labels = \
train_test_split(data, labels, test_size=EnronMail.TEST_SIZE, stratify=labels)
return LabelledCorpus(train_data, train_labels), LabelledCorpus(test_data, test_labels), target_names
def _check_n_authors(self, n_authors, n_open_set_authors):
pass
def _get_most_prolific_authors(self, n):
assert n <= EnronMail.NUM_AUTHORS, f'too many authors requested (maximum is {EnronMail.NUM_AUTHORS})'
author_paths = glob(self.mail_dir)
if n == -1:
return author_paths
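# rank authors by how many mails sit in their sent/ and sent_items/ folders and keep the n most prolific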
author_count = Counter(
{author_path:
len(glob(f'{author_path}/sent/*.')) +
len(glob(f'{author_path}/sent_items/*.'))
for author_path in author_paths
})
return [path for path, count in author_count.most_common(n)]
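# module-level helper used by the Parallel call in _fetch_and_split: parses one author's sent mails and returns (mails, author_name)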
def _fetch_emails_from_author(author_path, min_tokens):
subject_filters = ['fw:', 'fwd:', 're:']
body_filters = ['-----Original Message-----', '----- Forward', 'cc:', 'To:', 'to:', 'From:', 'from:']
parsed_mails = 0
author_mails = []
author_docs = 0
author_name = author_path[author_path.rindex('/') + 1:]
author_bar = tqdm(list(glob(f'{author_path}/sent/*.')) + list(glob(f'{author_path}/sent_items/*.')))
errors, trimmed = 0, 0
for email in author_bar:
author_bar.set_description(f'parsing for {author_path} errors={errors} trimmed={trimmed}')
raw_email = open(email, 'rb').read()
try:
parsed_mail = eml_parser.eml_parser.decode_email_b(raw_email, include_raw_body=True)
# subject = parsed_mail['header']['subject']
body = parsed_mail['body'][0]['content']
# for filter in subject_filters:
# if filter in subject.lower():
# continue
# for filter in body_filters:
# body = self.filter(body, filter)
# body = subject+'\n'+body
body_tokens = body.split()
ntokens = len(body_tokens)
if ntokens >= min_tokens:
if ntokens > EnronMail.MAX_MAIL_LENGTH:
trimmed += 1
body = ' '.join(body_tokens[:EnronMail.MAX_MAIL_LENGTH])
author_mails.append(body)
author_docs += 1
# if n_docs_by_author != -1 and author_docs >= n_docs_by_author:
# add_author = True
# break
parsed_mails += 1
except Exception:
errors += 1
print(f'ERROR in file {email}')
return author_mails, author_name

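Usage note (not part of the diff): this loader is driven through the new AuthorshipDataset.load entry point, which optionally caches the parsed corpus as a pickle. A minimal sketch, with illustrative paths that mirror the call added to main.py further down:

from data.AuthorshipDataset import AuthorshipDataset
from data.fetch_enron_mail import EnronMail

# parse (or reload from pickle) the 50 most prolific Enron authors;
# both paths are illustrative placeholders
dataset = AuthorshipDataset.load(
    EnronMail,
    pickle_path='../pickles/EnronMail50_1.pickle',
    mail_dir='../data/enron_mail_20150507/maildir/*',
    n_authors=50,
    docs_by_author=-1,
    random_state=1)

print(f'{len(dataset.train)} training and {len(dataset.test)} test mails')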
View File

@ -5,7 +5,7 @@ def evaluation(y_true, y_pred):
acc = accuracy_score(y_true, y_pred)
macrof1 = f1_score(y_true, y_pred, average='macro')
microf1 = f1_score(y_true, y_pred, average='micro')
-print(f'acc={acc * 100:.2f}%')
-print(f'macro-f1={macrof1:.2f}')
-print(f'micro-f1={microf1:.2f}')
+print(f'acc={acc * 100:.4f}%')
+print(f'macro-f1={macrof1:.4f}')
+print(f'micro-f1={microf1:.4f}')
return acc, macrof1, microf1

View File

@ -1,6 +1,7 @@
import numpy as np
from data.AuthorshipDataset import AuthorshipDataset
from data.fetch_imdb62 import Imdb62
from data.fetch_enron_mail import EnronMail
from index import Index
from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
from data.fetch_victorian import Victorian
@ -17,8 +18,9 @@ pad_length=3000
batch_size=50
n_epochs=256
bigrams=False
-n_authors=-1
+n_authors=50
docs_by_author=-1
seed=1
debug=False
if debug:
@ -28,8 +30,8 @@ if debug:
pad_length=100
batch_size=10
n_epochs=20
-n_authors = 5
-docs_by_author = 10
+n_authors = 50
+docs_by_author = -1
if torch.cuda.is_available():
device = torch.device('cuda')
@ -37,12 +39,15 @@ else:
device = torch.device('cpu')
print(f'running on {device}')
dataset = AuthorshipDataset.load(EnronMail, pickle_path=f'../pickles/EnronMail{n_authors}_{seed}.pickle', mail_dir='../../authorship_analysis/data/enron_mail_20150507/maildir/*', n_authors=n_authors, docs_by_author=-1, random_state=seed)
#dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25)
-dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
+#dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
Xtr, ytr = dataset.train.data, dataset.train.target
Xte, yte = dataset.test.data, dataset.test.target
A = np.unique(ytr)
print(f'num authors={len(A)}')
print(f'ntr = {len(Xtr)} nte = {len(Xte)}')
#sys.exit(0)
index = Index(analyzer='char', ngram_range=(2,2) if bigrams else (1,1))
Xtr = index.fit_transform(Xtr)