Alejandro Moreo Fernandez 2018-11-02 17:08:32 +01:00
parent 451dfd544d
commit 80956499d0
4 changed files with 140 additions and 77 deletions

View File

@@ -1,30 +1,37 @@
from sklearn.svm import *
from sklearn.model_selection import cross_val_score, GridSearchCV
from doc_representation import *
from sklearn.metrics import f1_score, make_scorer
from doc_representation import *
probability=False
# SVM = SVC
SVM = LinearSVC
# TODO: add function words
# TODO: other split policies
# TODO: understand normalization
# TODO: Mendenhall (characteristic curve of word lengths)
# TODO: wrap into an Estimator
probability=True
SVM = SVC
# SVM = LinearSVC
nfolds = 3
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
if SVM is SVC:
params['kernel']=['linear','rbf']
params['kernel']=['linear','rbf']
path = '../testi'
Xtr,ytr,ep1,ep2 = load_documents(path, split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1)
reader = LoadDocuments(split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1, split_policy=split_by_endline, normalize_features=True)
Xtr,ytr,ep1,ep2 = reader.load(path)
# learn a SVM
# svm = SVM(probability=probability)
svm = SVM()
svm = SVM(probability=probability)
# svm = SVM()
positive_examples = ytr.sum()
if positive_examples>nfolds:
print('optimizing {}'.format(svm.__class__.__name__))
svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score))
svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
svm.fit(Xtr, ytr)
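Not shown in this hunk, but for context: once the grid search has run, the fitted model would typically be applied to the two epistles returned by reader.load(). The snippet below is a minimal sketch of that step, not part of the commit; it assumes ep1 and ep2 hold the feature matrices (full document in the first row, fragments in the remaining rows, as documented in doc_representation) and that probability=True, so predict_proba is available.

# Illustrative sketch only (not in this commit): score the two epistles with the fitted model.
# GridSearchCV delegates predict_proba to the best estimator found during the search.
for name, EP in (('Epistola 1', ep1), ('Epistola 2', ep2)):
    probs = svm.predict_proba(EP)[:, 1]  # column 1 = probability of the positive class (Dante)
    full_doc, fragments = probs[0], probs[1:]
    print('{}: P(Dante)={:.3f} (full document), {:.3f} (mean over {} fragments)'.format(
        name, full_doc, fragments.mean() if fragments.size else float('nan'), fragments.size))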

View File

@@ -0,0 +1,3 @@
def warn(*args, **kwargs): pass
import warnings
warnings.warn = warn
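For reference, all this helper does is replace warnings.warn with a no-op before scikit-learn is imported (doc_representation.py now imports it first, see below), so deprecation messages are silently discarded. A minimal usage sketch, where the DeprecationWarning is just an illustration:

import disable_sklearn_warnings  # must be imported before the code that emits warnings
import warnings

warnings.warn('this message is silently discarded', DeprecationWarning)  # prints nothing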

View File

@@ -1,3 +1,4 @@
import disable_sklearn_warnings
import nltk
import numpy as np
import os
@@ -5,7 +6,9 @@ from os.path import join
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import normalize
from scipy.sparse import hstack, csr_matrix, issparse
import matplotlib.pyplot as plt
function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]
@@ -40,11 +43,15 @@ def _load_texts(path):
# split policies
# ------------------------------------------------------------------------
# TODO: implement other split policies (e.g., overlapping ones, etc)
def _split_by_endline(text):
def split_by_endline(text):
return [t.strip() for t in text.split('\n') if t.strip()]
def splitter(documents, authors=None, split_policy=_split_by_endline):
def split_by_sentences(text):
pass
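Outside the scope of this commit: one way the split_by_sentences stub could later be filled in is with NLTK's Punkt sentence tokenizer. This is only an assumption (it would require nltk.download('punkt'), and Punkt's default models are not tuned for Latin), sketched here for illustration:

# Hypothetical sketch, not part of this commit.
def split_by_sentences_sketch(text):
    # requires nltk.download('punkt'); accuracy on Latin text is not guaranteed
    return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]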
def splitter(documents, authors=None, split_policy=split_by_endline):
fragments = []
authors_fragments = []
for i, text in enumerate(documents):
@@ -71,6 +78,7 @@ def _features_function_words_freq(documents):
for text in documents:
tokens = nltk.word_tokenize(text)
author_tokens = ([token.lower() for token in tokens if any(char.isalpha() for char in token)])
# author_tokens = ([token.lower() for token in tokens])
freqs = nltk.FreqDist(author_tokens)
nwords = len(author_tokens)
@@ -96,6 +104,7 @@ def _features_tfidf(documents, tfidf_vectorizer=None):
return features, tfidf_vectorizer
def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
nF = X.shape[1]
num_feats = int(tfidf_feat_selection_ratio * nF)
@@ -105,84 +114,125 @@ def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
EP2 = feature_selector.transform(EP2)
return X,EP1,EP2
def load_documents(path,
def _features_mendel_hall(documents):
raise NotImplementedError('not yet implemented')
pass
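Purely as context for the stub above: Mendenhall's "characteristic curve of composition" is conventionally the distribution of word lengths in a text. A hypothetical sketch of what this function might eventually compute (the 23-letter cap and the helper name are assumptions, not part of the commit):

def _features_mendel_hall_sketch(documents, max_len=23):
    # Hypothetical: relative frequency of word lengths 1..max_len per document.
    features = []
    for text in documents:
        tokens = [t.lower() for t in nltk.word_tokenize(text) if any(c.isalpha() for c in t)]
        counts = np.zeros(max_len)
        for token in tokens:
            counts[min(len(token), max_len) - 1] += 1
        features.append(counts / max(len(tokens), 1))
    return np.array(features)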
class LoadDocuments:
def __init__(self,
function_words_freq=True,
tfidf=False,
tfidf_feat_selection_ratio=1.,
mendelhall=False,
split_documents=False,
split_policy = _split_by_endline,
split_policy = split_by_endline,
normalize_features=True,
verbose=True):
"""
Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
contain files named according to <author>_<text_name>.txt plus two special files EpistolaXIII_1.txt and
EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
:param path: the path containing the texts, each named as <author>_<text_name>.txt
:param function_words_freq: add the frequency of function words as features
:param tfidf: add the tfidf as features
:param split_documents: whether to split each text into smaller documents (currently, the policy is to split by '\n').
Currently, the fragments resulting from the split are added to the pool of documents (i.e., they are added
alongside the full documents rather than replacing them).
:param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
:param verbose: show information by stdout or not
:return: X, y, EP1, EP2, where X is the feature matrix for the training set (np.array or csr_matrix, depending on
whether tfidf is activated) and y is the label vector (np.array); EP1 and EP2 are the feature matrices for
Epistola 1 and Epistola 2, respectively: the first row corresponds to the full document and, if
split_documents=True, the remaining rows correspond to its fragments.
"""
"""
Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
contain files named according to <author>_<text_name>.txt plus two special files EpistolaXIII_1.txt and
EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
:param path: the path containing the texts, each named as <author>_<text_name>.txt
:param function_words_freq: add the frequency of function words as features
:param tfidf: add the tfidf as features
:param split_documents: whether to split each text into smaller documents (currently, the policy is to split by '\n').
Currently, the fragments resulting from the split are added to the pool of documents (i.e., they are added
alongside the full documents rather than replacing them).
:param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
:param verbose: show information by stdout or not
:return: X, y, EP1, EP2, where X is the feature matrix for the training set (np.array or csr_matrix, depending on
whether tfidf is activated) and y is the label vector (np.array); EP1 and EP2 are the feature matrices for
Epistola 1 and Epistola 2, respectively: the first row corresponds to the full document and, if
split_documents=True, the remaining rows correspond to its fragments.
"""
documents, authors, ep1_text, ep2_text = _load_texts(path)
ep1,ep2 = [ep1_text],[ep2_text]
n_original_docs=len(documents)
self.normalize_features=normalize_features
self.split_documents = split_documents
self.split_policy = split_policy
self.function_words_freq=function_words_freq
self.mendelhall = mendelhall
self.tfidf = tfidf
self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
self.verbose = verbose
if split_documents:
doc_fragments, authors_fragments = splitter(documents, authors, split_policy=split_policy)
documents.extend(doc_fragments)
authors.extend(authors_fragments)
def load(self, path):
documents, authors, ep1_text, ep2_text = _load_texts(path)
ep1,ep2 = [ep1_text],[ep2_text]
n_original_docs=len(documents)
ep1.extend(splitter(ep1, split_policy=split_policy))
ep2.extend(splitter(ep2, split_policy=split_policy))
if self.split_documents:
doc_fragments, authors_fragments = splitter(documents, authors, split_policy=self.split_policy)
documents.extend(doc_fragments)
authors.extend(authors_fragments)
ep1.extend(splitter(ep1, split_policy=self.split_policy))
ep2.extend(splitter(ep2, split_policy=self.split_policy))
# represent the target vector
y = np.array([(1 if author == "Dante" else 0) for author in authors])
# initialize the document-by-feature vector
X = np.empty((len(documents), 0))
EP1 = np.empty((len(ep1), 0))
EP2 = np.empty((len(ep2), 0))
# dense feature extraction functions
if self.function_words_freq:
X = self.addfeatures(X,_features_function_words_freq(documents))
EP1 = self.addfeatures(EP1, _features_function_words_freq(ep1))
EP2 = self.addfeatures(EP2, _features_function_words_freq(ep2))
if self.mendelhall:
X = self.addfeatures(X, _features_mendel_hall(documents))
EP1 = self.addfeatures(EP1, _features_mendel_hall(ep1))
EP2 = self.addfeatures(EP2, _features_mendel_hall(ep2))
# sparse feature extraction functions
if self.tfidf:
X_features, vectorizer = _features_tfidf(documents)
ep1_features, _ = _features_tfidf(ep1, vectorizer)
ep2_features, _ = _features_tfidf(ep2, vectorizer)
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
X_features, ep1_features, ep2_features = \
_feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
# matrix is sparse now
X = self.addfeatures(csr_matrix(X), X_features)
EP1 = self.addfeatures(csr_matrix(EP1), ep1_features)
EP2 = self.addfeatures(csr_matrix(EP2), ep2_features)
# represent the target vector
y = np.array([(1 if author == "Dante" else 0) for author in authors])
# print summary
if self.verbose:
print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
.format(self.function_words_freq, self.tfidf, self.split_documents, self.split_policy.__name__))
print('number of training (full) documents: {}'.format(n_original_docs))
print('X shape (#documents,#features): {}'.format(X.shape))
print('y prevalence: {:.2f}%'.format(y.mean()*100))
print('Epistola 1 shape:', EP1.shape)
print('Epistola 2 shape:', EP2.shape)
print()
# initialize the document-by-feature vector
X = np.empty((len(documents), 0))
EP1 = np.empty((len(ep1), 0))
EP2 = np.empty((len(ep2), 0))
return X, y, EP1, EP2
if function_words_freq:
X = np.hstack((X,_features_function_words_freq(documents)))
EP1 = np.hstack((EP1, _features_function_words_freq(ep1)))
EP2 = np.hstack((EP2, _features_function_words_freq(ep2)))
def addfeatures(self, X, F):
# plt.matshow(F[:25])
# plt.show()
if self.normalize_features:
F = normalize(F, axis=1, copy=False)  # reassign: normalize may return a copy rather than operate in place
if tfidf:
X_features, vectorizer = _features_tfidf(documents)
ep1_features, _ = _features_tfidf(ep1, vectorizer)
ep2_features, _ = _features_tfidf(ep2, vectorizer)
if issparse(F):
return hstack((X, F)) # sparse
else:
return np.hstack((X, F)) # dense
if tfidf_feat_selection_ratio < 1.:
if verbose: print('feature selection')
X_features, ep1_features, ep2_features = \
_feature_selection(X_features, y, ep1_features, ep2_features, tfidf_feat_selection_ratio)
# matrix is sparse now
X = hstack((csr_matrix(X), X_features))
EP1 = hstack((csr_matrix(EP1), ep1_features))
EP2 = hstack((csr_matrix(EP2), ep2_features))
# print summary
if verbose:
print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
.format(function_words_freq, tfidf, split_documents, split_policy.__name__))
print('number of training (full) documents: {}'.format(n_original_docs))
print('X shape (#documents,#features): {}'.format(X.shape))
print('y prevalence: {:.2f}%'.format(y.mean()*100))
print('Epistola 1 shape:', EP1.shape)
print('Epistola 2 shape:', EP2.shape)
print()
return X, y, EP1, EP2
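The addfeatures helper introduced above stacks each new block of features onto the current matrix, optionally L2-normalizing the block row-wise first, and switches to a sparse representation as soon as a sparse block (the tfidf features) arrives. A tiny self-contained illustration of that behaviour, with made-up toy values:

import numpy as np
from scipy.sparse import csr_matrix, hstack, issparse
from sklearn.preprocessing import normalize

X = np.empty((2, 0))                                             # start with no features, as in load()
dense_block = normalize(np.array([[3., 4.], [6., 8.]]), axis=1)  # each row scaled to unit L2 norm
X = np.hstack((X, dense_block))                                  # dense + dense -> dense

sparse_block = csr_matrix([[1., 0.], [0., 2.]])
X = hstack((csr_matrix(X), sparse_block))                        # once a block is sparse, the result stays sparse
print(issparse(X), X.shape)                                      # True (2, 4)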

src/requisites.txt (new file, 3 additions)
View File

@@ -0,0 +1,3 @@
scikit-learn >= 0.19.1
scipy >= 1.0.0
numpy >= 1.15.2