enron mail

Alejandro Moreo Fernandez 2020-05-07 14:03:47 +02:00
parent 0be3e5547e
commit 8ab808282a
4 changed files with 154 additions and 16 deletions

View File

@ -2,16 +2,24 @@ from abc import ABC, abstractmethod
import random
import numpy as np
from collections import Counter
import os
import pickle
class LabelledCorpus:
def __init__(self, documents, labels):
-if not isinstance(documents, np.ndarray): documents = np.asarray(documents)
+if not isinstance(documents, np.ndarray): documents = np.asarray(documents, dtype=str)
if not isinstance(labels, np.ndarray): labels = np.asarray(labels)
self.data = documents
self.target = labels
def _tolist(self):
self.data = self.data.tolist()
def _toarray(self):
self.data = np.asarray(self.data, dtype=str)
def __len__(self):
return len(self.data)
@ -27,8 +35,28 @@ class LabelledCorpus:
class AuthorshipDataset(ABC):
def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors = 0, random_state=42):
@classmethod
def load(cls, loader, pickle_path=None, **kwargs):
#assert isinstance(loader, AuthorshipDataset), 'unknown loader'
if pickle_path and os.path.exists(pickle_path):
print(f'loading pickled dataset from {pickle_path}')
dataset = pickle.load(open(pickle_path, 'rb'))
dataset.train._toarray()
dataset.test._toarray()
else:
dataset = loader(**kwargs)
if pickle_path:
print(f'dumping dataset to {pickle_path} for faster subsequent loads')
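# temporarily convert the numpy arrays to plain lists before pickling; they are restored to arrays right after the dump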
dataset.train._tolist()
dataset.test._tolist()
pickle.dump(dataset, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
dataset.train._toarray()
dataset.test._toarray()
return dataset
def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
self.data_path = data_path
self.n_authors = n_authors
random.seed(random_state)
np.random.seed(random_state)
@ -45,17 +73,14 @@ class AuthorshipDataset(ABC):
super().__init__()
@abstractmethod
def _fetch_and_split(self):
pass
@abstractmethod
def _check_n_authors(self, n_authors, n_open_set_authors):
pass
def _reduce_authors_documents(self, n_authors, n_docs_by_author, n_open_set_authors):
if n_authors != -1 or n_docs_by_author != -1:
@ -88,7 +113,6 @@ class AuthorshipDataset(ABC):
else:
self.test_out = None
# reindex labels so that the unique labels are equal to range(#num_different_authors)
# and unique training labels are range(#num_different_training_authors)
def _remove_label_gaps(self):
@ -131,11 +155,13 @@ class AuthorshipDataset(ABC):
return
author_doc_count = Counter(self.train.target)
-to_remove = frozenset([id for id,count in author_doc_count.most_common() if count<docs_by_author])
+to_remove = frozenset([id for id, count in author_doc_count.most_common() if count < docs_by_author])
assert len(to_remove) < len(author_doc_count), 'impossible selection'
-if len(to_remove)>0:
+if len(to_remove) > 0:
self.train = LabelledCorpus.filter(self.train, to_remove)
self.test = LabelledCorpus.filter(self.test, to_remove)
self.target_names = sorted(set(self.target_names) - to_remove)

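The reindexing described by the comment on _remove_label_gaps above is implemented outside the hunks shown here; as a sketch only, one way to remap labels so that training authors occupy range(#num_different_training_authors) and any test-only (open-set) authors take the ids that follow, with purely illustrative label values:

import numpy as np

# illustrative author ids only
train_labels = np.asarray([7, 7, 42, 42, 42, 13])
test_labels = np.asarray([13, 42, 99])  # 99 appears only in test (open set)

# training authors first, so they end up in range(#training_authors);
# any remaining test-only author gets the next free id
remap = {old: new for new, old in enumerate(np.unique(train_labels))}
for old in np.unique(test_labels):
    remap.setdefault(old, len(remap))

train_labels = np.asarray([remap[l] for l in train_labels])
test_labels = np.asarray([remap[l] for l in test_labels])
print(train_labels, test_labels)  # [0 0 2 2 2 1] [1 2 3]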
View File

@ -0,0 +1,107 @@
import eml_parser
from glob import glob
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from data.AuthorshipDataset import AuthorshipDataset, LabelledCorpus
import numpy as np
from joblib import Parallel, delayed
from collections import Counter
class EnronMail(AuthorshipDataset):
NUM_AUTHORS = 150
MAX_MAIL_LENGTH = 5000  # maximum mail length, in words
TEST_SIZE = 0.1
MIN_TOKENS = 10
def __init__(self, mail_dir='../data/enron_mail_20150507/maildir/*', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
self.mail_dir = mail_dir
super().__init__(mail_dir, n_authors, docs_by_author, n_open_set_authors, random_state)
def filter(self, base_str, filter_str):
if filter_str in base_str:
idx = base_str.index(filter_str)
base_str = base_str[:idx]
return base_str
def _fetch_and_split(self):
labels = []
data = []
path_list = self._get_most_prolific_authors(self.n_authors)
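# parse each selected author's mail folder in parallel, one job per author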
emails_authors = Parallel(n_jobs=-1)(
delayed(_fetch_emails_from_author)(author_path, EnronMail.MIN_TOKENS) for author_path in path_list
)
for emails, author in emails_authors:
data.extend(emails)
labels.extend([author]*len(emails))
target_names = sorted(np.unique(labels))
train_data, test_data, train_labels, test_labels = \
train_test_split(data, labels, test_size=EnronMail.TEST_SIZE, stratify=labels)
return LabelledCorpus(train_data, train_labels), LabelledCorpus(test_data, test_labels), target_names
def _check_n_authors(self, n_authors, n_open_set_authors):
pass
def _get_most_prolific_authors(self, n):
assert n <= EnronMail.NUM_AUTHORS, f'too many authors requested (maximum is {EnronMail.NUM_AUTHORS})'
author_paths = glob(self.mail_dir)
if n == -1:
return author_paths
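# rank authors by how many mails sit in their sent/ and sent_items/ folders and keep the n most prolific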
author_count = Counter(
{author_path:
len(glob(f'{author_path}/sent/*.')) +
len(glob(f'{author_path}/sent_items/*.'))
for author_path in author_paths
})
return [path for path, count in author_count.most_common(n)]
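# module-level helper used by the Parallel call in _fetch_and_split: parses one author's sent mails and returns (mails, author_name)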
def _fetch_emails_from_author(author_path, min_tokens):
subject_filters = ['fw:', 'fwd:', 're:']
body_filters = ['-----Original Message-----', '----- Forward', 'cc:', 'To:', 'to:', 'From:', 'from:']
parsed_mails = 0
author_mails = []
author_docs = 0
author_name = author_path[author_path.rindex('/') + 1:]
author_bar = tqdm(list(glob(f'{author_path}/sent/*.')) + list(glob(f'{author_path}/sent_items/*.')))
errors, trimmed = 0, 0
for email in author_bar:
author_bar.set_description(f'parsing for {author_path} errors={errors} trimmed={trimmed}')
raw_email = open(email, 'rb').read()
try:
parsed_mail = eml_parser.eml_parser.decode_email_b(raw_email, include_raw_body=True)
# subject = parsed_mail['header']['subject']
body = parsed_mail['body'][0]['content']
# for filter in subject_filters:
# if filter in subject.lower():
# continue
# for filter in body_filters:
# body = self.filter(body, filter)
# body = subject+'\n'+body
body_tokens = body.split()
ntokens = len(body_tokens)
if ntokens >= min_tokens:
if ntokens > EnronMail.MAX_MAIL_LENGTH:
trimmed += 1
body = ' '.join(body_tokens[:EnronMail.MAX_MAIL_LENGTH])
author_mails.append(body)
author_docs += 1
# if n_docs_by_author != -1 and author_docs >= n_docs_by_author:
# add_author = True
# break
parsed_mails += 1
except Exception:
errors += 1
print(f'ERROR in file {email}')
return author_mails, author_name

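Usage note (not part of the diff): this loader is driven through the new AuthorshipDataset.load entry point, which optionally caches the parsed corpus as a pickle. A minimal sketch, with illustrative paths that mirror the call added to main.py further down:

from data.AuthorshipDataset import AuthorshipDataset
from data.fetch_enron_mail import EnronMail

# parse (or reload from pickle) the 50 most prolific Enron authors;
# both paths are illustrative placeholders
dataset = AuthorshipDataset.load(
    EnronMail,
    pickle_path='../pickles/EnronMail50_1.pickle',
    mail_dir='../data/enron_mail_20150507/maildir/*',
    n_authors=50,
    docs_by_author=-1,
    random_state=1)

print(f'{len(dataset.train)} training and {len(dataset.test)} test mails')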
View File

@ -5,7 +5,7 @@ def evaluation(y_true, y_pred):
acc = accuracy_score(y_true, y_pred)
macrof1 = f1_score(y_true, y_pred, average='macro')
microf1 = f1_score(y_true, y_pred, average='micro')
-print(f'acc={acc * 100:.2f}%')
-print(f'macro-f1={macrof1:.2f}')
-print(f'micro-f1={microf1:.2f}')
+print(f'acc={acc * 100:.4f}%')
+print(f'macro-f1={macrof1:.4f}')
+print(f'micro-f1={microf1:.4f}')
return acc, macrof1, microf1

View File

@ -1,6 +1,7 @@
import numpy as np
from data.AuthorshipDataset import AuthorshipDataset
from data.fetch_imdb62 import Imdb62
from data.fetch_enron_mail import EnronMail
from index import Index
from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
from data.fetch_victorian import Victorian
@ -17,8 +18,9 @@ pad_length=3000
batch_size=50
n_epochs=256
bigrams=False
-n_authors=-1
+n_authors=50
docs_by_author=-1
seed=1
debug=False
if debug:
@ -28,8 +30,8 @@ if debug:
pad_length=100
batch_size=10
n_epochs=20
-n_authors = 5
-docs_by_author = 10
+n_authors = 50
+docs_by_author = -1
if torch.cuda.is_available():
device = torch.device('cuda')
@ -37,12 +39,15 @@ else:
device = torch.device('cpu')
print(f'running on {device}')
dataset = AuthorshipDataset.load(EnronMail, pickle_path=f'../pickles/EnronMail{n_authors}_{seed}.pickle', mail_dir='../../authorship_analysis/data/enron_mail_20150507/maildir/*', n_authors=n_authors, docs_by_author=-1, random_state=seed)
#dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25)
-dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
+#dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
Xtr, ytr = dataset.train.data, dataset.train.target
Xte, yte = dataset.test.data, dataset.test.target
A = np.unique(ytr)
print(f'num authors={len(A)}')
print(f'ntr = {len(Xtr)} nte = {len(Xte)}')
#sys.exit(0)
index = Index(analyzer='char', ngram_range=(2,2) if bigrams else (1,1))
Xtr = index.fit_transform(Xtr)