enron mail
This commit is contained in:
parent
0be3e5547e
commit
8ab808282a
|
@ -2,16 +2,24 @@ from abc import ABC, abstractmethod
|
|||
import random
|
||||
import numpy as np
|
||||
from collections import Counter
|
||||
import os
|
||||
import pickle
|
||||
|
||||
|
||||
class LabelledCorpus:
    """Pair a collection of documents with their labels.

    The documents are exposed as ``data`` and the labels as ``target``.
    """

    def __init__(self, documents, labels):
        """Store *documents* and *labels*, converting non-arrays to NumPy arrays.

        Args:
            documents: the documents; anything that is not already an
                ``np.ndarray`` is converted to a string array.
            labels: one label per document; anything that is not already an
                ``np.ndarray`` is converted with ``np.asarray``.
        """
        # Convert exactly once, and with dtype=str: a preceding dtype-less
        # conversion would make this check always false and leave
        # non-string input untouched, disagreeing with _toarray().
        if not isinstance(documents, np.ndarray):
            documents = np.asarray(documents, dtype=str)
        if not isinstance(labels, np.ndarray):
            labels = np.asarray(labels)
        self.data = documents
        self.target = labels

    def _tolist(self):
        """Replace ``self.data`` by its ``tolist()`` form.

        ``AuthorshipDataset.load`` calls this right before pickling.
        """
        self.data = self.data.tolist()

    def _toarray(self):
        """Replace ``self.data`` by a NumPy string array (inverse of ``_tolist``)."""
        self.data = np.asarray(self.data, dtype=str)

    def __len__(self):
        """Return the number of documents."""
        return len(self.data)
|
||||
|
||||
|
@ -27,8 +35,28 @@ class LabelledCorpus:
|
|||
|
||||
class AuthorshipDataset(ABC):
|
||||
|
||||
@classmethod
def load(cls, loader, pickle_path=None, **kwargs):
    """Build a dataset with *loader*, caching it on disk when requested.

    Args:
        loader: callable invoked as ``loader(**kwargs)`` to build the
            dataset when no cached pickle is available.
        pickle_path: optional path of the pickle cache. If the file exists
            the dataset is read from it and *loader* is not called;
            otherwise the freshly built dataset is written there.
        **kwargs: forwarded verbatim to *loader*.

    Returns:
        The dataset, with ``train`` and ``test`` documents held as arrays.
    """
    if pickle_path and os.path.exists(pickle_path):
        print(f'loading dataset image in {pickle_path}')
        # NOTE: unpickling executes arbitrary code; only point pickle_path
        # at files produced by this method.
        with open(pickle_path, 'rb') as pickle_file:
            dataset = pickle.load(pickle_file)
        dataset.train._toarray()
        dataset.test._toarray()
        return dataset

    dataset = loader(**kwargs)
    if pickle_path:
        print(f'dumping dataset in {pickle_path} for faster load')
        # Create the cache directory up front so that an expensive load is
        # not lost to a FileNotFoundError at dump time.
        pickle_dir = os.path.dirname(pickle_path)
        if pickle_dir:
            os.makedirs(pickle_dir, exist_ok=True)
        # The documents are pickled as plain lists and turned back into
        # arrays afterwards, even if the dump itself fails.
        dataset.train._tolist()
        dataset.test._tolist()
        try:
            with open(pickle_path, 'wb') as pickle_file:
                pickle.dump(dataset, pickle_file, pickle.HIGHEST_PROTOCOL)
        finally:
            dataset.train._toarray()
            dataset.test._toarray()
    return dataset
|
||||
|
||||
def __init__(self, data_path, n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
|
||||
self.data_path = data_path
|
||||
self.n_authors = n_authors
|
||||
|
||||
random.seed(random_state)
|
||||
np.random.seed(random_state)
|
||||
|
@ -45,17 +73,14 @@ class AuthorshipDataset(ABC):
|
|||
|
||||
super().__init__()
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def _fetch_and_split(self):
|
||||
pass
|
||||
|
||||
|
||||
@abstractmethod
|
||||
def _check_n_authors(self, n_authors, n_open_set_authors):
|
||||
pass
|
||||
|
||||
|
||||
def _reduce_authors_documents(self, n_authors, n_docs_by_author, n_open_set_authors):
|
||||
|
||||
if n_authors != -1 or n_docs_by_author != -1:
|
||||
|
@ -88,7 +113,6 @@ class AuthorshipDataset(ABC):
|
|||
else:
|
||||
self.test_out = None
|
||||
|
||||
|
||||
# reindex labels so that the unique labels are equal to range(#num_different_authors)
|
||||
# and unique training labels are range(#num_different_training_authors)
|
||||
def _remove_label_gaps(self):
|
||||
|
@ -139,3 +163,5 @@ class AuthorshipDataset(ABC):
|
|||
self.target_names = sorted(set(self.target_names) - to_remove)
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,107 @@
|
|||
import eml_parser
|
||||
from glob import glob
|
||||
from sklearn.model_selection import train_test_split
|
||||
from tqdm import tqdm
|
||||
from data.AuthorshipDataset import AuthorshipDataset, LabelledCorpus
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from collections import Counter
|
||||
|
||||
|
||||
class EnronMail(AuthorshipDataset):
    """Authorship dataset built from the sent folders of an Enron maildir."""

    NUM_AUTHORS = 150  # upper bound accepted for the number of requested authors
    MAX_MAIL_LENGHT = 5000  # in words
    TEST_SIZE = 0.1  # fraction of the emails held out for testing
    MIN_TOKENS = 10  # emails with fewer whitespace-separated tokens are dropped

    def __init__(self, mail_dir='../data/enron_mail_20150507/maildir/*', n_authors=-1, docs_by_author=-1, n_open_set_authors=0, random_state=42):
        """Create the dataset from the author directories matched by *mail_dir*.

        Args:
            mail_dir: glob pattern matching one directory per author.
            n_authors: number of most prolific authors to keep, -1 for all.
            docs_by_author: forwarded to the base class.
            n_open_set_authors: forwarded to the base class.
            random_state: forwarded to the base class.
        """
        # Stored before delegating to the base constructor.
        # NOTE(review): presumably the base constructor ends up calling
        # _fetch_and_split(), which reads self.mail_dir -- confirm.
        self.mail_dir = mail_dir
        super().__init__(mail_dir, n_authors, docs_by_author, n_open_set_authors, random_state)

    def filter(self, base_str, filter_str):
        """Return *base_str* cut right before the first *filter_str*.

        The string is returned unchanged when *filter_str* does not occur.
        """
        cut = base_str.find(filter_str)
        return base_str if cut == -1 else base_str[:cut]

    def _fetch_and_split(self):
        """Parse the emails of the selected authors and split them.

        Returns:
            A ``(train, test, target_names)`` triple: two ``LabelledCorpus``
            objects and the sorted list of author names.
        """
        path_list = self._get_most_prolific_authors(self.n_authors)
        emails_authors = Parallel(n_jobs=-1)(
            delayed(_fetch_emails_from_author)(author_path, EnronMail.MIN_TOKENS) for author_path in path_list
        )

        data, labels = [], []
        for emails, author in emails_authors:
            # The stratified split below raises when a class has a single
            # member, so authors without at least two usable emails are
            # left out instead of aborting the whole load.
            if len(emails) < 2:
                print(f'skipping author {author}: {len(emails)} usable email(s), '
                      f'at least 2 are needed for the stratified split')
                continue
            data.extend(emails)
            labels.extend([author] * len(emails))

        target_names = sorted(np.unique(labels))

        train_data, test_data, train_labels, test_labels = train_test_split(
            data, labels, test_size=EnronMail.TEST_SIZE, stratify=labels
        )

        return LabelledCorpus(train_data, train_labels), LabelledCorpus(test_data, test_labels), target_names

    def _check_n_authors(self, n_authors, n_open_set_authors):
        """Accept any combination of author counts (no check is performed)."""

    def _get_most_prolific_authors(self, n):
        """Return the directories of the *n* authors with most sent mail.

        Mail is counted as the files matching ``*.`` in the ``sent`` and
        ``sent_items`` subfolders. With ``n == -1`` every directory matched
        by ``self.mail_dir`` is returned, uncounted.
        """
        assert n <= EnronMail.NUM_AUTHORS, f'too many authors requested (maximum is {EnronMail.NUM_AUTHORS})'
        author_paths = glob(self.mail_dir)
        if n == -1:
            return author_paths

        author_count = Counter()
        for author_path in author_paths:
            sent = glob(f'{author_path}/sent/*.')
            sent_items = glob(f'{author_path}/sent_items/*.')
            author_count[author_path] = len(sent) + len(sent_items)
        return [path for path, _ in author_count.most_common(n)]
|
||||
|
||||
|
||||
def _fetch_emails_from_author(author_path, min_tokens):
    """Parse the sent mail of one author and return the usable bodies.

    Files matching ``*.`` in the ``sent`` and ``sent_items`` subfolders of
    *author_path* are parsed with ``eml_parser``. A body is kept when it has
    at least *min_tokens* whitespace-separated tokens; bodies longer than
    ``EnronMail.MAX_MAIL_LENGHT`` tokens are truncated to that many tokens.
    Files that cannot be parsed are reported and skipped.

    Args:
        author_path: directory of a single author.
        min_tokens: minimum number of tokens for a body to be kept.

    Returns:
        A ``(author_mails, author_name)`` pair: the list of kept bodies and
        the last component of *author_path*.
    """
    # Function-scope import: this module does not import os at file level.
    import os

    # basename() also copes with paths that contain no '/' separator.
    author_name = os.path.basename(author_path)
    mail_files = glob(f'{author_path}/sent/*.') + glob(f'{author_path}/sent_items/*.')
    author_bar = tqdm(mail_files)

    author_mails = []
    errors, trimmed = 0, 0
    for email in author_bar:
        author_bar.set_description(f'parsing for {author_path} errors={errors} trimmed={trimmed}')
        # Close every file right away; an author can have thousands of mails.
        with open(email, 'rb') as mail_file:
            raw_email = mail_file.read()
        try:
            parsed_mail = eml_parser.eml_parser.decode_email_b(raw_email, include_raw_body=True)
            body = parsed_mail['body'][0]['content']

            # NOTE: subject/body filtering of forwarded and replied content
            # is currently not applied; the raw first body part is used.
            body_tokens = body.split()
            ntokens = len(body_tokens)
            if ntokens >= min_tokens:
                if ntokens > EnronMail.MAX_MAIL_LENGHT:
                    trimmed += 1
                    body = ' '.join(body_tokens[:EnronMail.MAX_MAIL_LENGHT])
                author_mails.append(body)
        except Exception:
            # Best effort by design: one malformed mail must not abort the
            # parsing of the whole author.
            errors += 1
            print(f'ERROR in file {email}')

    return author_mails, author_name
|
|
@ -5,7 +5,7 @@ def evaluation(y_true, y_pred):
|
|||
acc = accuracy_score(y_true, y_pred)
|
||||
macrof1 = f1_score(y_true, y_pred, average='macro')
|
||||
microf1 = f1_score(y_true, y_pred, average='micro')
|
||||
print(f'acc={acc * 100:.2f}%')
|
||||
print(f'macro-f1={macrof1:.2f}')
|
||||
print(f'micro-f1={microf1:.2f}')
|
||||
print(f'acc={acc * 100:.4f}%')
|
||||
print(f'macro-f1={macrof1:.4f}')
|
||||
print(f'micro-f1={microf1:.4f}')
|
||||
return acc, macrof1, microf1
|
||||
|
|
15
src/main.py
15
src/main.py
|
@ -1,6 +1,7 @@
|
|||
import numpy as np
|
||||
|
||||
from data.AuthorshipDataset import AuthorshipDataset
|
||||
from data.fetch_imdb62 import Imdb62
|
||||
from data.fetch_enron_mail import EnronMail
|
||||
from index import Index
|
||||
from model.classifiers import AuthorshipAttributionClassifier, SameAuthorClassifier, FullAuthorClassifier
|
||||
from data.fetch_victorian import Victorian
|
||||
|
@ -17,8 +18,9 @@ pad_length=3000
|
|||
batch_size=50
|
||||
n_epochs=256
|
||||
bigrams=False
|
||||
n_authors=-1
|
||||
n_authors=50
|
||||
docs_by_author=-1
|
||||
seed=1
|
||||
|
||||
debug=False
|
||||
if debug:
|
||||
|
@ -28,8 +30,8 @@ if debug:
|
|||
pad_length=100
|
||||
batch_size=10
|
||||
n_epochs=20
|
||||
n_authors = 5
|
||||
docs_by_author = 10
|
||||
n_authors = 50
|
||||
docs_by_author = -1
|
||||
|
||||
if torch.cuda.is_available():
|
||||
device = torch.device('cuda')
|
||||
|
@ -37,12 +39,15 @@ else:
|
|||
device = torch.device('cpu')
|
||||
print(f'running on {device}')
|
||||
|
||||
dataset = AuthorshipDataset.load(EnronMail, pickle_path=f'../pickles/EnronMail{n_authors}_{seed}.pickle', mail_dir='../../authorship_analysis/data/enron_mail_20150507/maildir/*', n_authors=n_authors, docs_by_author=-1, random_state=seed)
|
||||
#dataset = Victorian(data_path='../../authorship_analysis/data/victoria', n_authors=5, docs_by_author=25)
|
||||
dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
|
||||
#dataset = Imdb62(data_path='../../authorship_analysis/data/imdb62/imdb62.txt', n_authors=n_authors, docs_by_author=docs_by_author)
|
||||
Xtr, ytr = dataset.train.data, dataset.train.target
|
||||
Xte, yte = dataset.test.data, dataset.test.target
|
||||
A = np.unique(ytr)
|
||||
print(f'num authors={len(A)}')
|
||||
print(f'ntr = {len(Xtr)} nte = {len(Xte)}')
|
||||
#sys.exit(0)
|
||||
|
||||
index = Index(analyzer='char', ngram_range=(2,2) if bigrams else (1,1))
|
||||
Xtr = index.fit_transform(Xtr)
|
||||
|
|
Loading…
Reference in New Issue