update
This commit is contained in:
parent 451dfd544d
commit 80956499d0

@@ -1,30 +1,37 @@
from sklearn.svm import *
from sklearn.model_selection import cross_val_score, GridSearchCV
from doc_representation import *
from sklearn.metrics import f1_score, make_scorer
from doc_representation import *

probability=False
# SVM = SVC
SVM = LinearSVC
# TODO: add function words
# TODO: other split policies
# TODO: understand normalization
# TODO: mendel hall
# TODO: wrap into an Estimator

probability=True
SVM = SVC
# SVM = LinearSVC

nfolds = 3
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
if SVM is SVC:
    params['kernel']=['linear','rbf']
    params['kernel']=['linear','rbf']

path = '../testi'

Xtr,ytr,ep1,ep2 = load_documents(path, split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1)
reader = LoadDocuments(split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1, split_policy=split_by_endline, normalize_features=True)
Xtr,ytr,ep1,ep2 = reader.load(path)

# learn a SVM

# svm = SVM(probability=probability)
svm = SVM()
svm = SVM(probability=probability)
# svm = SVM()

positive_examples = ytr.sum()
if positive_examples>nfolds:
    print('optimizing {}'.format(svm.__class__.__name__))
    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score))
    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score), n_jobs=-1)

svm.fit(Xtr, ytr)

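The hunk stops at svm.fit. As a hedged sketch (not part of this commit), this is the kind of follow-up that usually comes next with these objects: reporting the selected hyper-parameters and scoring the two epistolae, whose feature matrices are the ep1/ep2 returned by reader.load above (row 0 is the full document, per the loader's docstring).

# --- illustrative sketch, not part of the commit ---
if isinstance(svm, GridSearchCV):
    print('best params: {}'.format(svm.best_params_))
    print('best cross-validated F1: {:.3f}'.format(svm.best_score_))

# requires probability=True (i.e., SVM = SVC); column 1 is the "Dante" class
print('Epistola 1 P(Dante): {:.3f}'.format(svm.predict_proba(ep1)[0, 1]))
print('Epistola 2 P(Dante): {:.3f}'.format(svm.predict_proba(ep2)[0, 1]))
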
@@ -0,0 +1,3 @@
def warn(*args, **kwargs): pass
import warnings
warnings.warn = warn
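This new three-line module (presumably disable_sklearn_warnings, given the import added in the next hunk) silences scikit-learn's warning output by replacing warnings.warn with a no-op for the whole process. A minimal usage sketch:

# --- illustrative sketch, not part of the commit ---
import disable_sklearn_warnings   # monkey-patches warnings.warn with a no-op
import warnings

warnings.warn('deprecation noise')  # silently dropped, so sklearn warnings no longer clutter stdout
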
@@ -1,3 +1,4 @@
import disable_sklearn_warnings
import nltk
import numpy as np
import os
@@ -5,7 +6,9 @@ from os.path import join
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import normalize
from scipy.sparse import hstack, csr_matrix, issparse
import matplotlib.pyplot as plt

function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]

@@ -40,11 +43,15 @@ def _load_texts(path):
# split policies
# ------------------------------------------------------------------------
# TODO: implement other split policies (e.g., overlapping ones, etc)
def _split_by_endline(text):
def split_by_endline(text):
    return [t.strip() for t in text.split('\n') if t.strip()]


def splitter(documents, authors=None, split_policy=_split_by_endline):
def split_by_sentences(text):
    pass


def splitter(documents, authors=None, split_policy=split_by_endline):
    fragments = []
    authors_fragments = []
    for i, text in enumerate(documents):
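split_by_sentences is added here only as a stub (pass). A minimal sketch of how it could be filled in with NLTK's sentence tokenizer, mirroring split_by_endline (assumes the punkt model is available; not part of this commit):

# --- illustrative sketch, not part of the commit ---
import nltk

def split_by_sentences(text):
    # one fragment per sentence, blank fragments dropped
    return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]
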
@@ -71,6 +78,7 @@ def _features_function_words_freq(documents):
    for text in documents:
        tokens = nltk.word_tokenize(text)
        author_tokens = ([token.lower() for token in tokens if any(char.isalpha() for char in token)])
        # author_tokens = ([token.lower() for token in tokens])
        freqs = nltk.FreqDist(author_tokens)

        nwords = len(author_tokens)
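The hunk cuts off before the per-document feature row is assembled. Presumably each document contributes the relative frequency of every entry in function_words; a self-contained sketch of that computation (hypothetical helper name, not the verbatim tail of _features_function_words_freq):

# --- illustrative sketch, not part of the commit ---
import nltk
import numpy as np

def function_words_freq_row(text, function_words):
    # relative frequency of each function word among the alphabetic tokens of one document
    tokens = [t.lower() for t in nltk.word_tokenize(text) if any(c.isalpha() for c in t)]
    freqs = nltk.FreqDist(tokens)
    nwords = len(tokens)
    return np.array([freqs[w] / nwords if nwords else 0. for w in function_words])
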
@@ -96,6 +104,7 @@ def _features_tfidf(documents, tfidf_vectorizer=None):

    return features, tfidf_vectorizer


def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
    nF = X.shape[1]
    num_feats = int(tfidf_feat_selection_ratio * nF)
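Only the head of _feature_selection is visible here; its tail (EP2 = feature_selector.transform(EP2); return X,EP1,EP2) appears in the next hunk. The missing middle presumably fits a chi-squared selector on the training matrix only, along these lines (a sketch based on the SelectKBest/chi2 imports added above, not the verbatim original):

# --- illustrative sketch, not part of the commit ---
from sklearn.feature_selection import SelectKBest, chi2

feature_selector = SelectKBest(chi2, k=num_feats)   # keep the top num_feats features
X = feature_selector.fit_transform(X, y)            # fit on training data only
EP1 = feature_selector.transform(EP1)                # apply the same selection to the epistolae
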
@@ -105,84 +114,125 @@ def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
    EP2 = feature_selector.transform(EP2)
    return X,EP1,EP2

def load_documents(path,

def _features_mendel_hall(documents):
    raise NotImplementedError('not yet implemented')
    pass


class LoadDocuments:
def __init__(self,
function_words_freq=True,
tfidf=False,
tfidf_feat_selection_ratio=1.,
mendelhall=False,
split_documents=False,
split_policy = _split_by_endline,
split_policy = split_by_endline,
normalize_features=True,
verbose=True):
"""
|
||||
Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
|
||||
contain files named according to <author>_<text_name>.txt plus two special files EpistolaXIII_1.txt and
|
||||
EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
|
||||
:param path: the path containing the texts, each named as <author>_<text_name>.txt
|
||||
:param function_words_freq: add the frequency of function words as features
|
||||
:param tfidf: add the tfidf as features
|
||||
:param split_documents: whether to split text into smaller documents or not (currenty, the policy is to split by '\n').
|
||||
Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do not replace the
|
||||
full documents, which are anyway retained).
|
||||
:param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
|
||||
:param verbose: show information by stdout or not
|
||||
:return: np.arrays or csr_matrix (depending on whether tfidf is activated or not) X, y, EP1, EP2, where X is the
|
||||
matrix of features for the training set and y are the labels (np.array);
|
||||
EP1 and EP2 are the matrix of features for the epistola 1 (first row) and fragments (from row 2nd to last) if
|
||||
split_documents=True) and 2 (similar)
|
||||
"""
documents, authors, ep1_text, ep2_text = _load_texts(path)
ep1,ep2 = [ep1_text],[ep2_text]
n_original_docs=len(documents)
self.normalize_features=normalize_features
self.split_documents = split_documents
self.split_policy = split_policy
self.function_words_freq=function_words_freq
self.mendelhall = mendelhall
self.tfidf = tfidf
self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
self.verbose = verbose

if split_documents:
doc_fragments, authors_fragments = splitter(documents, authors, split_policy=split_policy)
documents.extend(doc_fragments)
authors.extend(authors_fragments)
def load(self, path):
documents, authors, ep1_text, ep2_text = _load_texts(path)
ep1,ep2 = [ep1_text],[ep2_text]
n_original_docs=len(documents)

ep1.extend(splitter(ep1, split_policy=split_policy))
ep2.extend(splitter(ep2, split_policy=split_policy))
if self.split_documents:
doc_fragments, authors_fragments = splitter(documents, authors, split_policy=self.split_policy)
documents.extend(doc_fragments)
authors.extend(authors_fragments)

ep1.extend(splitter(ep1, split_policy=self.split_policy))
ep2.extend(splitter(ep2, split_policy=self.split_policy))

# represent the target vector
y = np.array([(1 if author == "Dante" else 0) for author in authors])

# initialize the document-by-feature vector
X = np.empty((len(documents), 0))
EP1 = np.empty((len(ep1), 0))
EP2 = np.empty((len(ep2), 0))

# dense feature extraction functions
if self.function_words_freq:
X = self.addfeatures(X,_features_function_words_freq(documents))
EP1 = self.addfeatures(EP1, _features_function_words_freq(ep1))
EP2 = self.addfeatures(EP2, _features_function_words_freq(ep2))

if self.mendelhall:
X = self.addfeatures(X, _features_mendel_hall(documents))
EP1 = self.addfeatures(EP1, _features_mendel_hall(ep1))
EP2 = self.addfeatures(EP2, _features_mendel_hall(ep2))

# sparse feature extraction functions
if self.tfidf:
X_features, vectorizer = _features_tfidf(documents)
ep1_features, _ = _features_tfidf(ep1, vectorizer)
ep2_features, _ = _features_tfidf(ep2, vectorizer)

if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
X_features, ep1_features, ep2_features = \
_feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)

# matrix is sparse now
X = self.addfeatures(csr_matrix(X), X_features)
EP1 = self.addfeatures(csr_matrix(EP1), ep1_features)
EP2 = self.addfeatures(csr_matrix(EP2), ep2_features)


# represent the target vector
y = np.array([(1 if author == "Dante" else 0) for author in authors])
# print summary
if self.verbose:
print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
.format(self.function_words_freq, self.tfidf, self.split_documents, self.split_policy.__name__))
print('number of training (full) documents: {}'.format(n_original_docs))
print('X shape (#documents,#features): {}'.format(X.shape))
print('y prevalence: {:.2f}%'.format(y.mean()*100))
print('Epistola 1 shape:', EP1.shape)
print('Epistola 2 shape:', EP2.shape)
print()

# initialize the document-by-feature vector
X = np.empty((len(documents), 0))
EP1 = np.empty((len(ep1), 0))
EP2 = np.empty((len(ep2), 0))
return X, y, EP1, EP2

if function_words_freq:
X = np.hstack((X,_features_function_words_freq(documents)))
EP1 = np.hstack((EP1, _features_function_words_freq(ep1)))
EP2 = np.hstack((EP2, _features_function_words_freq(ep2)))
def addfeatures(self, X, F):
# plt.matshow(F[:25])
# plt.show()
if self.normalize_features:
normalize(F, axis=1, copy=False)

if tfidf:
X_features, vectorizer = _features_tfidf(documents)
ep1_features, _ = _features_tfidf(ep1, vectorizer)
ep2_features, _ = _features_tfidf(ep2, vectorizer)
if issparse(F):
return hstack((X, F)) # sparse
else:
return np.hstack((X, F)) # dense

if tfidf_feat_selection_ratio < 1.:
if verbose: print('feature selection')
X_features, ep1_features, ep2_features = \
_feature_selection(X_features, y, ep1_features, ep2_features, tfidf_feat_selection_ratio)

# matrix is sparse now
X = hstack((csr_matrix(X), X_features))
EP1 = hstack((csr_matrix(EP1), ep1_features))
EP2 = hstack((csr_matrix(EP2), ep2_features))

# print summary
if verbose:
print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
.format(function_words_freq, tfidf, split_documents, split_policy.__name__))
print('number of training (full) documents: {}'.format(n_original_docs))
print('X shape (#documents,#features): {}'.format(X.shape))
print('y prevalence: {:.2f}%'.format(y.mean()*100))
print('Epistola 1 shape:', EP1.shape)
print('Epistola 2 shape:', EP2.shape)
print()

return X, y, EP1, EP2

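_features_mendel_hall is introduced above only as a stub that raises NotImplementedError, matching the "# TODO: mendel hall" note. The name presumably refers to Mendenhall's characteristic curve (the distribution of word lengths per document); a hedged, self-contained sketch of what such a feature extractor could look like (hypothetical name and max_len, not the project's implementation):

# --- illustrative sketch, not part of the commit ---
import nltk
import numpy as np

def features_mendenhall(documents, max_len=20):
    # one row per document: relative frequency of word lengths 1..max_len
    rows = []
    for text in documents:
        tokens = [t for t in nltk.word_tokenize(text) if t.isalpha()]
        lengths = [min(len(t), max_len) for t in tokens]
        hist = np.array([lengths.count(l) for l in range(1, max_len + 1)], dtype=float)
        rows.append(hist / max(len(tokens), 1))
    return np.array(rows)
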
@@ -0,0 +1,3 @@
scikit-learn >= 0.19.1
scipy >= 1.0.0
numpy >= 1.15.2