This commit is contained in:
Alejandro Moreo Fernandez 2018-11-02 17:08:32 +01:00
parent 451dfd544d
commit 80956499d0
4 changed files with 140 additions and 77 deletions

View File

@@ -1,11 +1,17 @@
from sklearn.svm import *
from sklearn.model_selection import cross_val_score, GridSearchCV
from doc_representation import *
from sklearn.metrics import f1_score, make_scorer
from doc_representation import *
probability=False
# SVM = SVC
SVM = LinearSVC
# TODO: add function words
# TODO: other split policies
# TODO: understand normalization
# TODO: Mendenhall's characteristic-curve (word-length) features
# TODO: wrap into an Estimator
probability=True
SVM = SVC
# SVM = LinearSVC
nfolds = 3
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
@@ -14,17 +20,18 @@ if SVM is SVC:
path = '../testi'
Xtr,ytr,ep1,ep2 = load_documents(path, split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1)
reader = LoadDocuments(split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1, split_policy=split_by_endline, normalize_features=True)
Xtr,ytr,ep1,ep2 = reader.load(path)
# learn a SVM
# svm = SVM(probability=probability)
svm = SVM()
svm = SVM(probability=probability)
# svm = SVM()
positive_examples = ytr.sum()
if positive_examples>nfolds:
print('optimizing {}'.format(svm.__class__.__name__))
svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score))
svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
svm.fit(Xtr, ytr)

View File

@@ -0,0 +1,3 @@
# silence sklearn's (deprecation) warnings by monkey-patching warnings.warn with a no-op;
# importing this module before sklearn also suppresses warnings raised at import time
def warn(*args, **kwargs): pass
import warnings
warnings.warn = warn

View File

@@ -1,3 +1,4 @@
import disable_sklearn_warnings
import nltk
import numpy as np
import os
@@ -5,7 +6,9 @@ from os.path import join
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import normalize
from scipy.sparse import hstack, csr_matrix, issparse
import matplotlib.pyplot as plt
function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]
@@ -40,11 +43,15 @@ def _load_texts(path):
# split policies
# ------------------------------------------------------------------------
# TODO: implement other split policies (e.g., overlapping ones, etc)
def _split_by_endline(text):
def split_by_endline(text):
return [t.strip() for t in text.split('\n') if t.strip()]
def splitter(documents, authors=None, split_policy=_split_by_endline):
def split_by_sentences(text):
pass
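# A possible completion of split_by_sentences (hypothetical sketch, not part of this
# commit), assuming nltk.sent_tokenize from the `import nltk` above; its default punkt
# model is trained on English, so sentence boundaries in Latin text are only approximate.
def split_by_sentences_sketch(text):
    return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]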
def splitter(documents, authors=None, split_policy=split_by_endline):
fragments = []
authors_fragments = []
for i, text in enumerate(documents):
@@ -71,6 +78,7 @@ def _features_function_words_freq(documents):
for text in documents:
tokens = nltk.word_tokenize(text)
author_tokens = ([token.lower() for token in tokens if any(char.isalpha() for char in token)])
# author_tokens = ([token.lower() for token in tokens])
freqs = nltk.FreqDist(author_tokens)
nwords = len(author_tokens)
@@ -96,6 +104,7 @@ def _features_tfidf(documents, tfidf_vectorizer=None):
return features, tfidf_vectorizer
def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
nF = X.shape[1]
num_feats = int(tfidf_feat_selection_ratio * nF)
@@ -105,12 +114,25 @@ def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
EP2 = feature_selector.transform(EP2)
return X,EP1,EP2
def load_documents(path,
def _features_mendel_hall(documents):
raise NotImplementedError('not yet implemented')
pass
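# Hypothetical sketch (not part of this commit) of a Mendenhall-style extractor, for
# illustration only: Mendenhall's "characteristic curve of composition" is the relative
# frequency of word lengths; the helper name, the 23-character cap and the tokenization
# choices below are assumptions.
def _features_mendel_hall_sketch(documents, upto=23):
    features = []
    for text in documents:
        lengths = [min(len(token), upto) for token in nltk.word_tokenize(text) if token.isalpha()]
        nwords = max(len(lengths), 1)
        features.append([lengths.count(l) / nwords for l in range(1, upto + 1)])
    return np.array(features)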
class LoadDocuments:
def __init__(self,
function_words_freq=True,
tfidf=False,
tfidf_feat_selection_ratio=1.,
mendelhall=False,
split_documents=False,
split_policy = _split_by_endline,
split_policy = split_by_endline,
normalize_features=True,
verbose=True):
"""
Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
@@ -130,18 +152,27 @@ def load_documents(path,
split_documents=True) and 2 (similar)
"""
self.normalize_features=normalize_features
self.split_documents = split_documents
self.split_policy = split_policy
self.function_words_freq=function_words_freq
self.mendelhall = mendelhall
self.tfidf = tfidf
self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
self.verbose = verbose
def load(self, path):
documents, authors, ep1_text, ep2_text = _load_texts(path)
ep1,ep2 = [ep1_text],[ep2_text]
n_original_docs=len(documents)
if split_documents:
doc_fragments, authors_fragments = splitter(documents, authors, split_policy=split_policy)
if self.split_documents:
doc_fragments, authors_fragments = splitter(documents, authors, split_policy=self.split_policy)
documents.extend(doc_fragments)
authors.extend(authors_fragments)
ep1.extend(splitter(ep1, split_policy=split_policy))
ep2.extend(splitter(ep2, split_policy=split_policy))
ep1.extend(splitter(ep1, split_policy=self.split_policy))
ep2.extend(splitter(ep2, split_policy=self.split_policy))
# represent the target vector
y = np.array([(1 if author == "Dante" else 0) for author in authors])
@@ -151,31 +182,38 @@ def load_documents(path,
EP1 = np.empty((len(ep1), 0))
EP2 = np.empty((len(ep2), 0))
if function_words_freq:
X = np.hstack((X,_features_function_words_freq(documents)))
EP1 = np.hstack((EP1, _features_function_words_freq(ep1)))
EP2 = np.hstack((EP2, _features_function_words_freq(ep2)))
# dense feature extraction functions
if self.function_words_freq:
X = self.addfeatures(X,_features_function_words_freq(documents))
EP1 = self.addfeatures(EP1, _features_function_words_freq(ep1))
EP2 = self.addfeatures(EP2, _features_function_words_freq(ep2))
if tfidf:
if self.mendelhall:
X = self.addfeatures(X, _features_mendel_hall(documents))
EP1 = self.addfeatures(EP1, _features_mendel_hall(ep1))
EP2 = self.addfeatures(EP2, _features_mendel_hall(ep2))
# sparse feature extraction functions
if self.tfidf:
X_features, vectorizer = _features_tfidf(documents)
ep1_features, _ = _features_tfidf(ep1, vectorizer)
ep2_features, _ = _features_tfidf(ep2, vectorizer)
if tfidf_feat_selection_ratio < 1.:
if verbose: print('feature selection')
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
X_features, ep1_features, ep2_features = \
_feature_selection(X_features, y, ep1_features, ep2_features, tfidf_feat_selection_ratio)
_feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
# matrix is sparse now
X = hstack((csr_matrix(X), X_features))
EP1 = hstack((csr_matrix(EP1), ep1_features))
EP2 = hstack((csr_matrix(EP2), ep2_features))
X = self.addfeatures(csr_matrix(X), X_features)
EP1 = self.addfeatures(csr_matrix(EP1), ep1_features)
EP2 = self.addfeatures(csr_matrix(EP2), ep2_features)
# print summary
if verbose:
if self.verbose:
print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
.format(function_words_freq, tfidf, split_documents, split_policy.__name__))
.format(self.function_words_freq, self.tfidf, self.split_documents, self.split_policy.__name__))
print('number of training (full) documents: {}'.format(n_original_docs))
print('X shape (#documents,#features): {}'.format(X.shape))
print('y prevalence: {:.2f}%'.format(y.mean()*100))
@@ -185,4 +223,16 @@ def load_documents(path,
return X, y, EP1, EP2
def addfeatures(self, X, F):
# plt.matshow(F[:25])
# plt.show()
if self.normalize_features:
# reassign: normalize may return a copy (e.g., when the input is not float), in which
# case calling it without reassignment would silently leave F unnormalized
F = normalize(F, axis=1, copy=False)
if issparse(F):
return hstack((X, F)) # sparse
else:
return np.hstack((X, F)) # dense
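# Usage illustration with hypothetical values (not part of this commit): with
# normalize_features=True every new feature block is L2-normalized row-wise before
# being appended, e.g.
#   LoadDocuments().addfeatures(np.zeros((2, 0)), np.array([[3., 4.], [1., 0.]]))
#   returns array([[0.6, 0.8],
#                  [1. , 0. ]])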

src/requisites.txt Normal file
View File

@@ -0,0 +1,3 @@
scikit-learn >= 0.19.1
scipy >= 1.0.0
numpy >= 1.15.2