diff --git a/src/classifier.py b/src/classifier.py
index e4b00e2..c23fc08 100644
--- a/src/classifier.py
+++ b/src/classifier.py
@@ -1,30 +1,37 @@
 from sklearn.svm import *
 from sklearn.model_selection import cross_val_score, GridSearchCV
-from doc_representation import *
 from sklearn.metrics import f1_score, make_scorer
+from doc_representation import *
 
-probability=False
-# SVM = SVC
-SVM = LinearSVC
+# TODO: add function words
+# TODO: other split policies
+# TODO: understand normalization
+# TODO: Mendenhall features
+# TODO: wrap into an Estimator
+
+probability=True
+SVM = SVC
+# SVM = LinearSVC
 
 nfolds = 3
 params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
 if SVM is SVC:
-    params['kernel']=['linear','rbf']
+    params['kernel']=['linear','rbf']
 
 path = '../testi'
-Xtr,ytr,ep1,ep2 = load_documents(path, split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1)
+reader = LoadDocuments(split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1, split_policy=split_by_endline, normalize_features=True)
+Xtr,ytr,ep1,ep2 = reader.load(path)
 
 # learn a SVM
-# svm = SVM(probability=probability)
-svm = SVM()
+svm = SVM(probability=probability)
+# svm = SVM()
 
 positive_examples = ytr.sum()
 if positive_examples>nfolds:
     print('optimizing {}'.format(svm.__class__.__name__))
-    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score))
+    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
 
 svm.fit(Xtr, ytr)
diff --git a/src/disable_sklearn_warnings.py b/src/disable_sklearn_warnings.py
new file mode 100644
index 0000000..e669983
--- /dev/null
+++ b/src/disable_sklearn_warnings.py
@@ -0,0 +1,3 @@
+def warn(*args, **kwargs): pass
+import warnings
+warnings.warn = warn
diff --git a/src/doc_representation.py b/src/doc_representation.py
index d194d95..abaaf2e 100644
--- a/src/doc_representation.py
+++ b/src/doc_representation.py
@@ -1,3 +1,4 @@
+import disable_sklearn_warnings
 import nltk
 import numpy as np
 import os
@@ -5,7 +6,9 @@ from os.path import join
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_selection import SelectKBest
 from sklearn.feature_selection import chi2
-from scipy.sparse import hstack, csr_matrix
+from sklearn.preprocessing import normalize
+from scipy.sparse import hstack, csr_matrix, issparse
+import matplotlib.pyplot as plt
 
 function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]
 
@@ -40,11 +43,15 @@ def _load_texts(path):
 # split policies
 # ------------------------------------------------------------------------
 # TODO: implement other split policies (e.g., overlapping ones, etc)
-def _split_by_endline(text):
+def split_by_endline(text):
     return [t.strip() for t in text.split('\n') if t.strip()]
 
 
-def splitter(documents, authors=None, split_policy=_split_by_endline):
+def split_by_sentences(text):
+    pass
+
+
+def splitter(documents, authors=None, split_policy=split_by_endline):
     fragments = []
     authors_fragments = []
     for i, text in enumerate(documents):
@@ -71,6 +78,7 @@ def _features_function_words_freq(documents):
     for text in documents:
         tokens = nltk.word_tokenize(text)
         author_tokens = ([token.lower() for token in tokens if any(char.isalpha() for char in token)])
+        # author_tokens = ([token.lower() for token in tokens])
        freqs = nltk.FreqDist(author_tokens)
 
         nwords = len(author_tokens)
@@ -96,6 +104,7 @@ def _features_tfidf(documents, tfidf_vectorizer=None):
 
     return features, tfidf_vectorizer
 
+
 def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
     nF = X.shape[1]
     num_feats = int(tfidf_feat_selection_ratio * nF)
@@ -105,84 +114,125 @@ def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
     EP2 = feature_selector.transform(EP2)
     return X,EP1,EP2
 
-def load_documents(path,
+
+def _features_mendel_hall(documents):
+    raise NotImplementedError('not yet implemented')
+    pass
+
+
+
+
+
+class LoadDocuments:
+    def __init__(self,
                    function_words_freq=True,
                    tfidf=False,
                    tfidf_feat_selection_ratio=1.,
+                   mendelhall=False,
                    split_documents=False,
-                   split_policy = _split_by_endline,
+                   split_policy = split_by_endline,
+                   normalize_features=True,
                    verbose=True):
-    """
-    Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
-    contain files named according to <author>_<textname>.txt plus two special files EpistolaXIII_1.txt and
-    EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
-    :param path: the path containing the texts, each named as <author>_<textname>.txt
-    :param function_words_freq: add the frequency of function words as features
-    :param tfidf: add the tfidf as features
-    :param split_documents: whether to split text into smaller documents or not (currenty, the policy is to split by '\n').
-    Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do not replace the
-    full documents, which are anyway retained).
-    :param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
-    :param verbose: show information by stdout or not
-    :return: np.arrays or csr_matrix (depending on whether tfidf is activated or not) X, y, EP1, EP2, where X is the
-    matrix of features for the training set and y are the labels (np.array);
-    EP1 and EP2 are the matrix of features for the epistola 1 (first row) and fragments (from row 2nd to last) if
-    split_documents=True) and 2 (similar)
-    """
+        """
+        Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
+        contain files named according to <author>_<textname>.txt plus two special files EpistolaXIII_1.txt and
+        EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
+        :param path: the path containing the texts, each named as <author>_<textname>.txt
+        :param function_words_freq: add the frequency of function words as features
+        :param tfidf: add the tfidf as features
+        :param split_documents: whether to split text into smaller documents or not (currently, the policy is to split by '\n').
+        Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do not replace the
+        full documents, which are anyway retained).
+        :param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
+        :param verbose: show information by stdout or not
+        :return: np.arrays or csr_matrix (depending on whether tfidf is activated or not) X, y, EP1, EP2, where X is the
+        matrix of features for the training set and y are the labels (np.array);
+        EP1 and EP2 are the matrices of features for epistola 1 and epistola 2 respectively: the first row corresponds to the
+        full document and the remaining rows to its fragments (present only if split_documents=True)
+        """
 
-    documents, authors, ep1_text, ep2_text = _load_texts(path)
-    ep1,ep2 = [ep1_text],[ep2_text]
-    n_original_docs=len(documents)
+        self.normalize_features=normalize_features
+        self.split_documents = split_documents
+        self.split_policy = split_policy
+        self.function_words_freq=function_words_freq
+        self.mendelhall = mendelhall
+        self.tfidf = tfidf
+        self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
+        self.verbose = verbose
 
-    if split_documents:
-        doc_fragments, authors_fragments = splitter(documents, authors, split_policy=split_policy)
-        documents.extend(doc_fragments)
-        authors.extend(authors_fragments)
+    def load(self, path):
+        documents, authors, ep1_text, ep2_text = _load_texts(path)
+        ep1,ep2 = [ep1_text],[ep2_text]
+        n_original_docs=len(documents)
 
-        ep1.extend(splitter(ep1, split_policy=split_policy))
-        ep2.extend(splitter(ep2, split_policy=split_policy))
+        if self.split_documents:
+            doc_fragments, authors_fragments = splitter(documents, authors, split_policy=self.split_policy)
+            documents.extend(doc_fragments)
+            authors.extend(authors_fragments)
+
+            ep1.extend(splitter(ep1, split_policy=self.split_policy))
+            ep2.extend(splitter(ep2, split_policy=self.split_policy))
+
+        # represent the target vector
+        y = np.array([(1 if author == "Dante" else 0) for author in authors])
+
+        # initialize the document-by-feature vector
+        X = np.empty((len(documents), 0))
+        EP1 = np.empty((len(ep1), 0))
+        EP2 = np.empty((len(ep2), 0))
+
+        # dense feature extraction functions
+        if self.function_words_freq:
+            X = self.addfeatures(X,_features_function_words_freq(documents))
+            EP1 = self.addfeatures(EP1, _features_function_words_freq(ep1))
+            EP2 = self.addfeatures(EP2, _features_function_words_freq(ep2))
+
+        if self.mendelhall:
+            X = self.addfeatures(X, _features_mendel_hall(documents))
+            EP1 = self.addfeatures(EP1, _features_mendel_hall(ep1))
+            EP2 = self.addfeatures(EP2, _features_mendel_hall(ep2))
+
+        # sparse feature extraction functions
+        if self.tfidf:
+            X_features, vectorizer = _features_tfidf(documents)
+            ep1_features, _ = _features_tfidf(ep1, vectorizer)
+            ep2_features, _ = _features_tfidf(ep2, vectorizer)
+
+            if self.tfidf_feat_selection_ratio < 1.:
+                if self.verbose: print('feature selection')
+                X_features, ep1_features, ep2_features = \
+                    _feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
+
+            # matrix is sparse now
+            X = self.addfeatures(csr_matrix(X), X_features)
+            EP1 = self.addfeatures(csr_matrix(EP1), ep1_features)
+            EP2 = self.addfeatures(csr_matrix(EP2), ep2_features)
 
-    # represent the target vector
-    y = np.array([(1 if author == "Dante" else 0) for author in authors])
+        # print summary
+        if self.verbose:
+            print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
+                  .format(self.function_words_freq, self.tfidf, self.split_documents, self.split_policy.__name__))
+            print('number of training (full) documents: {}'.format(n_original_docs))
+            print('X shape (#documents,#features): {}'.format(X.shape))
+            print('y prevalence: {:.2f}%'.format(y.mean()*100))
+            print('Epistola 1 shape:', EP1.shape)
+            print('Epistola 2 shape:', EP2.shape)
+            print()
 
-    # initialize the document-by-feature vector
-    X = np.empty((len(documents), 0))
-    EP1 = np.empty((len(ep1), 0))
-    EP2 = np.empty((len(ep2), 0))
+        return X, y, EP1, EP2
 
-    if function_words_freq:
-        X = np.hstack((X,_features_function_words_freq(documents)))
-        EP1 = np.hstack((EP1, _features_function_words_freq(ep1)))
-        EP2 = np.hstack((EP2, _features_function_words_freq(ep2)))
+    def addfeatures(self, X, F):
+        # plt.matshow(F[:25])
+        # plt.show()
+        if self.normalize_features:
+            F = normalize(F, axis=1, copy=False)
 
-    if tfidf:
-        X_features, vectorizer = _features_tfidf(documents)
-        ep1_features, _ = _features_tfidf(ep1, vectorizer)
-        ep2_features, _ = _features_tfidf(ep2, vectorizer)
+        if issparse(F):
+            return hstack((X, F))   # sparse
+        else:
+            return np.hstack((X, F))  # dense
 
-        if tfidf_feat_selection_ratio < 1.:
-            if verbose: print('feature selection')
-            X_features, ep1_features, ep2_features = \
-                _feature_selection(X_features, y, ep1_features, ep2_features, tfidf_feat_selection_ratio)
-
-        # matrix is sparse now
-        X = hstack((csr_matrix(X), X_features))
-        EP1 = hstack((csr_matrix(EP1), ep1_features))
-        EP2 = hstack((csr_matrix(EP2), ep2_features))
-
-
-    # print summary
-    if verbose:
-        print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
-              .format(function_words_freq, tfidf, split_documents, split_policy.__name__))
-        print('number of training (full) documents: {}'.format(n_original_docs))
-        print('X shape (#documents,#features): {}'.format(X.shape))
-        print('y prevalence: {:.2f}%'.format(y.mean()*100))
-        print('Epistola 1 shape:', EP1.shape)
-        print('Epistola 2 shape:', EP2.shape)
-        print()
-
-    return X, y, EP1, EP2
diff --git a/src/requisites.txt b/src/requisites.txt
new file mode 100644
index 0000000..1793191
--- /dev/null
+++ b/src/requisites.txt
@@ -0,0 +1,3 @@
+scikit-learn >= 0.19.1
+scipy >= 1.0.0
+numpy >= 1.15.2
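Note (not part of the patch): the diff introduces split_by_sentences and _features_mendel_hall only as stubs (the latter raises NotImplementedError). The sketch below shows one possible way to complete them; it assumes NLTK's Punkt models are installed (nltk.download('punkt')) and reads the "Mendenhall" TODO as Mendenhall's characteristic curve, i.e. the distribution of word lengths. The max_word_length parameter is an illustrative assumption, not taken from the repository.

import nltk
import numpy as np


def split_by_sentences(text):
    # split a text into non-empty sentence fragments using NLTK's Punkt tokenizer
    return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]


def _features_mendel_hall(documents, max_word_length=20):
    # for each document, the relative frequency of word lengths 1..max_word_length
    # (longer words are collapsed into the last bin)
    features = []
    for text in documents:
        tokens = [t for t in nltk.word_tokenize(text) if t.isalpha()]
        counts = np.zeros(max_word_length)
        for t in tokens:
            counts[min(len(t), max_word_length) - 1] += 1
        features.append(counts / max(len(tokens), 1))
    return np.array(features)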
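A second note: classifier.py now stops at svm.fit(Xtr, ytr), but since the patch switches to SVC with probability=True, the fitted model (or the GridSearchCV wrapping it) exposes predict_proba. A possible continuation, sketched here rather than taken from the repository, scores the two epistolas; it assumes both classes occur in ytr and follows the docstring's convention that row 0 of ep1/ep2 is the full document and the remaining rows are its fragments.

# hypothetical continuation of classifier.py: posterior of the "Dante" class (label 1)
prob_ep1 = svm.predict_proba(ep1)[:, 1]   # row 0 = full text, rows 1.. = fragments
prob_ep2 = svm.predict_proba(ep2)[:, 1]
print('Epistola 1: P(Dante | full text) = {:.3f}'.format(prob_ep1[0]))
print('Epistola 2: P(Dante | full text) = {:.3f}'.format(prob_ep2[0]))
print('Epistola 1: mean P(Dante) over fragments = {:.3f}'.format(prob_ep1[1:].mean()))
print('Epistola 2: mean P(Dante) over fragments = {:.3f}'.format(prob_ep2[1:].mean()))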