From 8ab808282a0899c153d6a1f349c3c70a4490652a Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Thu, 7 May 2020 14:03:47 +0200
Subject: [PATCH] enron mail

---
 src/data/AuthorshipDataset.py |  42 ++++++++++---
 src/data/fetch_enron_mail.py  | 107 ++++++++++++++++++++++++++++++++++
 src/evaluation.py             |   6 +-
 src/main.py                   |  15 +++--
 4 files changed, 154 insertions(+), 16 deletions(-)
 create mode 100644 src/data/fetch_enron_mail.py

diff --git a/src/data/AuthorshipDataset.py b/src/data/AuthorshipDataset.py
index e3b33cc..972ba2d 100644
--- a/src/data/AuthorshipDataset.py
+++ b/src/data/AuthorshipDataset.py
@@ -2,16 +2,24 @@ from abc import ABC, abstractmethod
 import random
 import numpy as np
 from collections import Counter
+import os
+import pickle


 class LabelledCorpus:

     def __init__(self, documents, labels):
-        if not isinstance(documents, np.ndarray): documents = np.asarray(documents)
+        if not isinstance(documents, np.ndarray): documents = np.asarray(documents, dtype=str)
         if not isinstance(labels, np.ndarray): labels = np.asarray(labels)
         self.data = documents
         self.target = labels

+    def _tolist(self):
+        self.data = self.data.tolist()
+
+    def _toarray(self):
+        self.data = np.asarray(self.data, dtype=str)
+
     def __len__(self):
         return len(self.data)

@@ -27,8 +35,28 @@ class LabelledCorpus:


 class AuthorshipDataset(ABC):

-    def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors = 0, random_state=42):
+    @classmethod
+    def load(cls, loader, pickle_path=None, **kwargs):
+        #assert isinstance(loader, AuthorshipDataset), 'unknown loader'
+        if pickle_path and os.path.exists(pickle_path):
+            print(f'loading dataset image in {pickle_path}')
+            dataset = pickle.load(open(pickle_path, 'rb'))
+            dataset.train._toarray()
+            dataset.test._toarray()
+        else:
+            dataset = loader(**kwargs)
+            if pickle_path:
+                print(f'dumping dataset in {pickle_path} for faster load')
+                dataset.train._tolist()
+                dataset.test._tolist()
+                pickle.dump(dataset, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
+                dataset.train._toarray()
+                dataset.test._toarray()
+        return dataset
+
+    def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
         self.data_path = data_path
+        self.n_authors = n_authors

         random.seed(random_state)
         np.random.seed(random_state)
@@ -45,17 +73,14 @@ class AuthorshipDataset(ABC):

         super().__init__()

-
     @abstractmethod
     def _fetch_and_split(self):
         pass

-
     @abstractmethod
     def _check_n_authors(self, n_authors, n_open_set_authors):
         pass

-
     def _reduce_authors_documents(self, n_authors, n_docs_by_author, n_open_set_authors):

         if n_authors != -1 or n_docs_by_author != -1:
@@ -88,7 +113,6 @@
         else:
             self.test_out = None

-
     # reindex labels so that the unique labels are equal to range(#num_different_authors)
     # and unique training labels are range(#num_different_training_authors)
     def _remove_label_gaps(self):
@@ -131,11 +155,13 @@
             return

         author_doc_count = Counter(self.train.target)
-        to_remove = frozenset([id for id,count in author_doc_count.most_common() if count
-        if len(to_remove)>0:
+        if len(to_remove) > 0:
             self.train = LabelledCorpus.filter(self.train, to_remove)
             self.test = LabelledCorpus.filter(self.test, to_remove)
             self.target_names = sorted(set(self.target_names) - to_remove)
+
+
diff --git a/src/data/fetch_enron_mail.py b/src/data/fetch_enron_mail.py
new file mode 100644
index 0000000..ad82d85
--- /dev/null
+++ b/src/data/fetch_enron_mail.py
@@ -0,0 +1,107 @@
+import eml_parser
+from glob import glob
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm
+from data.AuthorshipDataset import AuthorshipDataset, LabelledCorpus
+import numpy as np
+from joblib import Parallel, delayed
+from collections import Counter
+
+
+class EnronMail(AuthorshipDataset):
+
+    NUM_AUTHORS = 150
+    MAX_MAIL_LENGHT = 5000  # in words
+    TEST_SIZE = 0.1
+    MIN_TOKENS = 10
+
+    def __init__(self, mail_dir='../data/enron_mail_20150507/maildir/*', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
+        self.mail_dir = mail_dir
+        super().__init__(mail_dir, n_authors, docs_by_author, n_open_set_authors, random_state)
+
+    def filter(self, base_str, filter_str):
+        if filter_str in base_str:
+            idx = base_str.index(filter_str)
+            base_str = base_str[:idx]
+        return base_str
+
+    def _fetch_and_split(self):
+        labels = []
+        data = []
+
+        path_list = self._get_most_prolific_authors(self.n_authors)
+        emails_authors = Parallel(n_jobs=-1)(
+            delayed(_fetch_emails_from_author)(author_path, EnronMail.MIN_TOKENS) for author_path in path_list
+        )
+        for emails, author in emails_authors:
+            data.extend(emails)
+            labels.extend([author]*len(emails))
+
+        target_names = sorted(np.unique(labels))
+
+        train_data, test_data, train_labels, test_labels = \
+            train_test_split(data, labels, test_size=EnronMail.TEST_SIZE, stratify=labels)
+
+        return LabelledCorpus(train_data, train_labels), LabelledCorpus(test_data, test_labels), target_names
+
+    def _check_n_authors(self, n_authors, n_open_set_authors):
+        pass
+
+    def _get_most_prolific_authors(self, n):
+        assert n <= EnronMail.NUM_AUTHORS, f'too many authors requested (maximum is {EnronMail.NUM_AUTHORS})'
+        author_paths = glob(self.mail_dir)
+        if n == -1:
+            return author_paths
+        author_count = Counter(
+            {author_path:
+                 len(glob(f'{author_path}/sent/*.')) +
+                 len(glob(f'{author_path}/sent_items/*.'))
+             for author_path in author_paths
+             })
+        return [path for path, count in author_count.most_common(n)]
+
+
+def _fetch_emails_from_author(author_path, min_tokens):
+    subject_filters = ['fw:', 'fwd:', 're:']
+    body_filters = ['-----Original Message-----', '----- Forward', 'cc:', 'To:', 'to:', 'From:', 'from:']
+
+    parsed_mails = 0
+    author_mails = []
+    author_docs = 0
+    author_name = author_path[author_path.rindex('/') + 1:]
+    author_bar = tqdm(list(glob(f'{author_path}/sent/*.')) + list(glob(f'{author_path}/sent_items/*.')))
+    errors, trimmed = 0, 0
+    for email in author_bar:
+        author_bar.set_description(f'parsing for {author_path} errors={errors} trimmed={trimmed}')
+        raw_email = open(email, 'rb').read()
+        try:
+            parsed_mail = eml_parser.eml_parser.decode_email_b(raw_email, include_raw_body=True)
+            # subject = parsed_mail['header']['subject']
+            body = parsed_mail['body'][0]['content']
+
+            # for filter in subject_filters:
+            #     if filter in subject.lower():
+            #         continue
+
+            # for filter in body_filters:
+            #     body = self.filter(body, filter)
+
+            # body = subject+'\n'+body
+            body_tokens = body.split()
+            ntokens = len(body_tokens)
+            if ntokens >= min_tokens:
+                if ntokens > EnronMail.MAX_MAIL_LENGHT:
+                    trimmed += 1
+                    body = ' '.join(body_tokens[:EnronMail.MAX_MAIL_LENGHT])
+                author_mails.append(body)
+                author_docs += 1
+                # if n_docs_by_author != -1 and author_docs >= n_docs_by_author:
+                #     add_author = True
+                #     break
+
+            parsed_mails += 1
+        except Exception:
+            errors += 1
+            print(f'ERROR in file {email}')
+
+    return author_mails, author_name
diff --git a/src/evaluation.py b/src/evaluation.py
index afb58b4..69c3be7 100644
--- a/src/evaluation.py
+++ b/src/evaluation.py
@@ -5,7 +5,7 @@ def evaluation(y_true, y_pred):
     acc = accuracy_score(y_true, y_pred)
     macrof1 = f1_score(y_true, y_pred, average='macro')
     microf1 = f1_score(y_true, y_pred, average='micro')
-    print(f'acc={acc * 100:.2f}%')
-    print(f'macro-f1={macrof1:.2f}')
-    print(f'micro-f1={microf1:.2f}')
+    print(f'acc={acc * 100:.4f}%')
+    print(f'macro-f1={macrof1:.4f}')
+    print(f'micro-f1={microf1:.4f}')
     return acc, macrof1, microf1
diff --git a/src/main.py b/src/main.py
index 4832c1f..f735230 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,6 +1,7 @@
 import numpy as np
-
+from data.AuthorshipDataset import AuthorshipDataset
 from data.fetch_imdb62 import Imdb62
+from data.fetch_enron_mail import EnronMail
 from index import Index
 from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
 from data.fetch_victorian import Victorian
@@ -17,8 +18,9 @@ pad_length=3000
 batch_size=50
 n_epochs=256
 bigrams=False
-n_authors=-1
+n_authors=50
 docs_by_author=-1
+seed=1
 debug=False

 if debug:
@@ -28,8 +30,8 @@ if debug:
     pad_length=100
     batch_size=10
     n_epochs=20
-    n_authors = 5
-    docs_by_author = 10
+    n_authors = 50
+    docs_by_author = -1

 if torch.cuda.is_available():
     device = torch.device('cuda')
@@ -37,12 +39,15 @@ else:
     device = torch.device('cpu')
 print(f'running on {device}')

+dataset = AuthorshipDataset.load(EnronMail, pickle_path=f'../pickles/EnronMail{n_authors}_{seed}.pickle', mail_dir='../../authorship_analysis/data/enron_mail_20150507/maildir/*', n_authors=n_authors, docs_by_author=-1, random_state=seed)
 #dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25)
-dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
+#dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
 Xtr, ytr = dataset.train.data, dataset.train.target
 Xte, yte = dataset.test.data, dataset.test.target
 A = np.unique(ytr)
 print(f'num authors={len(A)}')
+print(f'ntr = {len(Xtr)} nte = {len(Xte)}')
+#sys.exit(0)

 index = Index(analyzer='char', ngram_range=(2,2) if bigrams else (1,1))
 Xtr = index.fit_transform(Xtr)
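
After this patch, the Enron corpus is meant to be loaded through the new AuthorshipDataset.load entry point, which builds the dataset once and caches a pickled image for faster subsequent runs. Below is a minimal usage sketch mirroring the call in main.py; the pickle path and maildir location are illustrative values, and the ../pickles/ directory is assumed to exist.

    from data.AuthorshipDataset import AuthorshipDataset
    from data.fetch_enron_mail import EnronMail

    # first call parses the mail folders and dumps a pickle; later calls reload that image directly
    dataset = AuthorshipDataset.load(
        EnronMail,
        pickle_path='../pickles/EnronMail50_1.pickle',     # assumed cache location
        mail_dir='../data/enron_mail_20150507/maildir/*',  # assumed corpus location
        n_authors=50,
        docs_by_author=-1,
        random_state=1)

    print(f'ntr = {len(dataset.train)} nte = {len(dataset.test)}')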