Alejandro Moreo Fernandez 2018-11-02 17:08:32 +01:00
parent 451dfd544d
commit 80956499d0
4 changed files with 140 additions and 77 deletions


@@ -1,30 +1,37 @@
 from sklearn.svm import *
 from sklearn.model_selection import cross_val_score, GridSearchCV
-from doc_representation import *
 from sklearn.metrics import f1_score, make_scorer
+from doc_representation import *
 
-probability=False
-# SVM = SVC
-SVM = LinearSVC
+# TODO: add function words
+# TODO: other split policies
+# TODO: understand normalization
+# TODO: mendel hall
+# TODO: wrap into an Estimator
+
+probability=True
+SVM = SVC
+# SVM = LinearSVC
 
 nfolds = 3
 params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
 if SVM is SVC:
     params['kernel']=['linear','rbf']
 
 path = '../testi'
-Xtr,ytr,ep1,ep2 = load_documents(path, split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1)
+reader = LoadDocuments(split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1, split_policy=split_by_endline, normalize_features=True)
+Xtr,ytr,ep1,ep2 = reader.load(path)
 
 # learn a SVM
-# svm = SVM(probability=probability)
-svm = SVM()
+svm = SVM(probability=probability)
+# svm = SVM()
 positive_examples = ytr.sum()
 if positive_examples>nfolds:
     print('optimizing {}'.format(svm.__class__.__name__))
-    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score))
+    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
 svm.fit(Xtr, ytr)
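After fitting, the GridSearchCV wrapper exposes the selected configuration; a minimal sketch of how the winning parameters might be reported (illustrative, not part of this commit):

    if isinstance(svm, GridSearchCV):
        print('best params:', svm.best_params_)
        print('best cross-validated F1: {:.3f}'.format(svm.best_score_))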

src/disable_sklearn_warnings.py (new file)

@@ -0,0 +1,3 @@
+def warn(*args, **kwargs): pass
+import warnings
+warnings.warn = warn
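The new module suppresses sklearn's warnings by monkey-patching warnings.warn with a no-op. A sketch of the standard-library alternative, should a less invasive switch be preferred (an assumption, not what the commit does):

    import warnings
    warnings.filterwarnings('ignore')  # optionally narrow with category=... or module=...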

src/doc_representation.py

@@ -1,3 +1,4 @@
+import disable_sklearn_warnings
 import nltk
 import numpy as np
 import os
@@ -5,7 +6,9 @@ from os.path import join
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_selection import SelectKBest
 from sklearn.feature_selection import chi2
-from scipy.sparse import hstack, csr_matrix
+from sklearn.preprocessing import normalize
+from scipy.sparse import hstack, csr_matrix, issparse
+import matplotlib.pyplot as plt
 
 function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]
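normalize, newly imported above, rescales each row of a feature matrix to unit norm, which is presumably what the new normalize_features option relies on; a minimal illustration of its default row-wise L2 behavior:

    import numpy as np
    from sklearn.preprocessing import normalize

    F = np.array([[3., 4.]])
    print(normalize(F, norm='l2', axis=1))  # [[0.6 0.8]]: each row divided by its L2 norm (5.0)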
@@ -40,11 +43,15 @@ def _load_texts(path):
 # split policies
 # ------------------------------------------------------------------------
 # TODO: implement other split policies (e.g., overlapping ones, etc)
-def _split_by_endline(text):
+def split_by_endline(text):
     return [t.strip() for t in text.split('\n') if t.strip()]
 
-def splitter(documents, authors=None, split_policy=_split_by_endline):
+
+def split_by_sentences(text):
+    pass
+
+
+def splitter(documents, authors=None, split_policy=split_by_endline):
     fragments = []
     authors_fragments = []
     for i, text in enumerate(documents):
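split_by_sentences is committed as a stub; a minimal sketch of how it might be filled in, assuming NLTK's sentence tokenizer (hypothetical, and it requires the 'punkt' data to be downloaded):

    import nltk

    def split_by_sentences(text):
        # hypothetical: one fragment per sentence, mirroring split_by_endline's cleanup
        return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]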
@@ -71,6 +78,7 @@ def _features_function_words_freq(documents):
     for text in documents:
         tokens = nltk.word_tokenize(text)
         author_tokens = ([token.lower() for token in tokens if any(char.isalpha() for char in token)])
+        # author_tokens = ([token.lower() for token in tokens])
         freqs = nltk.FreqDist(author_tokens)
         nwords = len(author_tokens)
@@ -96,6 +104,7 @@ def _features_tfidf(documents, tfidf_vectorizer=None):
     return features, tfidf_vectorizer
 
+
 def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
     nF = X.shape[1]
     num_feats = int(tfidf_feat_selection_ratio * nF)
@@ -105,84 +114,125 @@ def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
     EP2 = feature_selector.transform(EP2)
     return X,EP1,EP2
 
-def load_documents(path,
-                   function_words_freq=True,
-                   tfidf=False,
-                   tfidf_feat_selection_ratio=1.,
-                   split_documents=False,
-                   split_policy=_split_by_endline,
-                   verbose=True):
+
+def _features_mendel_hall(documents):
+    raise NotImplementedError('not yet implemented')
+
+
+class LoadDocuments:
+    def __init__(self,
+                 function_words_freq=True,
+                 tfidf=False,
+                 tfidf_feat_selection_ratio=1.,
+                 mendelhall=False,
+                 split_documents=False,
+                 split_policy=split_by_endline,
+                 normalize_features=True,
+                 verbose=True):
         """
         Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
         contain files named according to <author>_<text_name>.txt plus two special files EpistolaXIII_1.txt and
         EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
         :param path: the path containing the texts, each named as <author>_<text_name>.txt
         :param function_words_freq: add the frequency of function words as features
         :param tfidf: add the tfidf as features
         :param split_documents: whether to split text into smaller documents or not (currently, the policy is to split
        by '\n'). Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do
        not replace the full documents, which are anyway retained).
         :param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
         :param verbose: show information by stdout or not
         :return: np.arrays or csr_matrix (depending on whether tfidf is activated or not) X, y, EP1, EP2, where X is the
        matrix of features for the training set and y are the labels (np.array);
        EP1 and EP2 are the matrices of features for epistola 1 (first row, plus its fragments from the 2nd row to the
        last if split_documents=True) and epistola 2 (similar)
         """
-    documents, authors, ep1_text, ep2_text = _load_texts(path)
-    ep1,ep2 = [ep1_text],[ep2_text]
-    n_original_docs=len(documents)
-
-    if split_documents:
-        doc_fragments, authors_fragments = splitter(documents, authors, split_policy=split_policy)
-        documents.extend(doc_fragments)
-        authors.extend(authors_fragments)
-        ep1.extend(splitter(ep1, split_policy=split_policy))
-        ep2.extend(splitter(ep2, split_policy=split_policy))
-
-    # represent the target vector
-    y = np.array([(1 if author == "Dante" else 0) for author in authors])
-
-    # initialize the document-by-feature vector
-    X = np.empty((len(documents), 0))
-    EP1 = np.empty((len(ep1), 0))
-    EP2 = np.empty((len(ep2), 0))
-
-    if function_words_freq:
-        X = np.hstack((X,_features_function_words_freq(documents)))
-        EP1 = np.hstack((EP1, _features_function_words_freq(ep1)))
-        EP2 = np.hstack((EP2, _features_function_words_freq(ep2)))
-
-    if tfidf:
-        X_features, vectorizer = _features_tfidf(documents)
-        ep1_features, _ = _features_tfidf(ep1, vectorizer)
-        ep2_features, _ = _features_tfidf(ep2, vectorizer)
-        if tfidf_feat_selection_ratio < 1.:
-            if verbose: print('feature selection')
-            X_features, ep1_features, ep2_features = \
-                _feature_selection(X_features, y, ep1_features, ep2_features, tfidf_feat_selection_ratio)
-        # matrix is sparse now
-        X = hstack((csr_matrix(X), X_features))
-        EP1 = hstack((csr_matrix(EP1), ep1_features))
-        EP2 = hstack((csr_matrix(EP2), ep2_features))
-
-    # print summary
-    if verbose:
-        print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
-              .format(function_words_freq, tfidf, split_documents, split_policy.__name__))
-        print('number of training (full) documents: {}'.format(n_original_docs))
-        print('X shape (#documents,#features): {}'.format(X.shape))
-        print('y prevalence: {:.2f}%'.format(y.mean()*100))
-        print('Epistola 1 shape:', EP1.shape)
-        print('Epistola 2 shape:', EP2.shape)
-        print()
-
-    return X, y, EP1, EP2
+        self.normalize_features = normalize_features
+        self.split_documents = split_documents
+        self.split_policy = split_policy
+        self.function_words_freq = function_words_freq
+        self.mendelhall = mendelhall
+        self.tfidf = tfidf
+        self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
+        self.verbose = verbose
+
+    def load(self, path):
+        documents, authors, ep1_text, ep2_text = _load_texts(path)
+        ep1,ep2 = [ep1_text],[ep2_text]
+        n_original_docs=len(documents)
+
+        if self.split_documents:
+            doc_fragments, authors_fragments = splitter(documents, authors, split_policy=self.split_policy)
+            documents.extend(doc_fragments)
+            authors.extend(authors_fragments)
+            ep1.extend(splitter(ep1, split_policy=self.split_policy))
+            ep2.extend(splitter(ep2, split_policy=self.split_policy))
+
+        # represent the target vector
+        y = np.array([(1 if author == "Dante" else 0) for author in authors])
+
+        # initialize the document-by-feature vector
+        X = np.empty((len(documents), 0))
+        EP1 = np.empty((len(ep1), 0))
+        EP2 = np.empty((len(ep2), 0))
+
+        # dense feature extraction functions
+        if self.function_words_freq:
+            X = self.addfeatures(X, _features_function_words_freq(documents))
+            EP1 = self.addfeatures(EP1, _features_function_words_freq(ep1))
+            EP2 = self.addfeatures(EP2, _features_function_words_freq(ep2))
+
+        if self.mendelhall:
+            X = self.addfeatures(X, _features_mendel_hall(documents))
+            EP1 = self.addfeatures(EP1, _features_mendel_hall(ep1))
+            EP2 = self.addfeatures(EP2, _features_mendel_hall(ep2))
+
+        # sparse feature extraction functions
+        if self.tfidf:
+            X_features, vectorizer = _features_tfidf(documents)
+            ep1_features, _ = _features_tfidf(ep1, vectorizer)
+            ep2_features, _ = _features_tfidf(ep2, vectorizer)
+            if self.tfidf_feat_selection_ratio < 1.:
+                if self.verbose: print('feature selection')
+                X_features, ep1_features, ep2_features = \
+                    _feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
+            # matrix is sparse now
+            X = self.addfeatures(csr_matrix(X), X_features)
+            EP1 = self.addfeatures(csr_matrix(EP1), ep1_features)
+            EP2 = self.addfeatures(csr_matrix(EP2), ep2_features)
+
+        # print summary
+        if self.verbose:
+            print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
+                  .format(self.function_words_freq, self.tfidf, self.split_documents, self.split_policy.__name__))
+            print('number of training (full) documents: {}'.format(n_original_docs))
+            print('X shape (#documents,#features): {}'.format(X.shape))
+            print('y prevalence: {:.2f}%'.format(y.mean()*100))
+            print('Epistola 1 shape:', EP1.shape)
+            print('Epistola 2 shape:', EP2.shape)
+            print()
+
+        return X, y, EP1, EP2
+
+    def addfeatures(self, X, F):
+        # plt.matshow(F[:25]); plt.show()  # (debugging aid, left commented out)
+        if self.normalize_features:
+            normalize(F, axis=1, copy=False)  # L2-normalize each row, in place when possible
+        if issparse(F):
+            return hstack((X, F))     # sparse concatenation
+        else:
+            return np.hstack((X, F))  # dense concatenation
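For reference, a minimal usage sketch of the refactored API (directory layout as described in the docstring; illustrative, not part of the commit):

    from doc_representation import LoadDocuments, split_by_endline

    reader = LoadDocuments(split_documents=True, function_words_freq=True, tfidf=True,
                           tfidf_feat_selection_ratio=0.1, split_policy=split_by_endline,
                           normalize_features=True)
    X, y, EP1, EP2 = reader.load('../testi')

One caveat in addfeatures: sklearn.preprocessing.normalize returns the normalized matrix, and with copy=False it only operates in place when the input needs no conversion (e.g., a float array or CSR matrix), so writing F = normalize(F, axis=1, copy=False) would be the more defensive pattern.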

src/requisites.txt (new file)

@@ -0,0 +1,3 @@
+scikit-learn >= 0.19.1
+scipy >= 1.0.0
+numpy >= 1.15.2
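Assuming the file is meant for pip, the environment would be set up with:

    pip install -r src/requisites.txt

Note that doc_representation.py additionally imports nltk and matplotlib, which this list does not pin.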