Alejandro Moreo Fernandez 2018-11-02 17:08:32 +01:00
parent 451dfd544d
commit 80956499d0
4 changed files with 140 additions and 77 deletions

View File

@@ -1,30 +1,37 @@
from sklearn.svm import *
from sklearn.model_selection import cross_val_score, GridSearchCV
from doc_representation import *
from sklearn.metrics import f1_score, make_scorer
from doc_representation import *
probability=False
# SVM = SVC
SVM = LinearSVC
# TODO: add function words
# TODO: other split policies
# TODO: understand normalization
# TODO: Mendenhall (characteristic curve of word lengths)
# TODO: wrap into an Estimator
probability=True
SVM = SVC
# SVM = LinearSVC
nfolds = 3
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
if SVM is SVC:
params['kernel']=['linear','rbf']
params['kernel']=['linear','rbf']
path = '../testi'
Xtr,ytr,ep1,ep2 = load_documents(path, split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1)
reader = LoadDocuments(split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1, split_policy=split_by_endline, normalize_features=True)
Xtr,ytr,ep1,ep2 = reader.load(path)
# learn a SVM
# svm = SVM(probability=probability)
svm = SVM()
svm = SVM(probability=probability)
# svm = SVM()
positive_examples = ytr.sum()
if positive_examples>nfolds:
print('optimizing {}'.format(svm.__class__.__name__))
svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score))
svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
svm.fit(Xtr, ytr)
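Not shown in this hunk, but for context: once the grid search has run, the fitted model would typically be applied to the two epistles returned by reader.load(). The snippet below is a minimal sketch of that step, not part of the commit; it assumes ep1 and ep2 hold the feature matrices (full document in the first row, fragments in the remaining rows, as documented in doc_representation) and that probability=True, so predict_proba is available.

# Illustrative sketch only (not in this commit): score the two epistles with the fitted model.
# GridSearchCV delegates predict_proba to the best estimator found during the search.
for name, EP in (('Epistola 1', ep1), ('Epistola 2', ep2)):
    probs = svm.predict_proba(EP)[:, 1]  # column 1 = probability of the positive class (Dante)
    full_doc, fragments = probs[0], probs[1:]
    print('{}: P(Dante)={:.3f} (full document), {:.3f} (mean over {} fragments)'.format(
        name, full_doc, fragments.mean() if fragments.size else float('nan'), fragments.size))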

View File

@@ -0,0 +1,3 @@
def warn(*args, **kwargs): pass
import warnings
warnings.warn = warn
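For reference, all this helper does is replace warnings.warn with a no-op before scikit-learn is imported (doc_representation.py now imports it first, see below), so deprecation messages are silently discarded. A minimal usage sketch, where the DeprecationWarning is just an illustration:

import disable_sklearn_warnings  # must be imported before the code that emits warnings
import warnings

warnings.warn('this message is silently discarded', DeprecationWarning)  # prints nothing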

View File

@@ -1,3 +1,4 @@
import disable_sklearn_warnings
import nltk
import numpy as np
import os
@@ -5,7 +6,9 @@ from os.path import join
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import normalize
from scipy.sparse import hstack, csr_matrix, issparse
import matplotlib.pyplot as plt
function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]
@@ -40,11 +43,15 @@ def _load_texts(path):
# split policies
# ------------------------------------------------------------------------
# TODO: implement other split policies (e.g., overlapping ones, etc)
def _split_by_endline(text):
def split_by_endline(text):
return [t.strip() for t in text.split('\n') if t.strip()]
def splitter(documents, authors=None, split_policy=_split_by_endline):
def split_by_sentences(text):
pass
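Outside the scope of this commit: one way the split_by_sentences stub could later be filled in is with NLTK's Punkt sentence tokenizer. This is only an assumption (it would require nltk.download('punkt'), and Punkt's default models are not tuned for Latin), sketched here for illustration:

# Hypothetical sketch, not part of this commit.
def split_by_sentences_sketch(text):
    # requires nltk.download('punkt'); accuracy on Latin text is not guaranteed
    return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]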
def splitter(documents, authors=None, split_policy=split_by_endline):
fragments = []
authors_fragments = []
for i, text in enumerate(documents):
@@ -71,6 +78,7 @@ def _features_function_words_freq(documents):
for text in documents:
tokens = nltk.word_tokenize(text)
author_tokens = ([token.lower() for token in tokens if any(char.isalpha() for char in token)])
# author_tokens = ([token.lower() for token in tokens])
freqs = nltk.FreqDist(author_tokens)
nwords = len(author_tokens)
@@ -96,6 +104,7 @@ def _features_tfidf(documents, tfidf_vectorizer=None):
return features, tfidf_vectorizer
def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
nF = X.shape[1]
num_feats = int(tfidf_feat_selection_ratio * nF)
@@ -105,84 +114,125 @@ def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
EP2 = feature_selector.transform(EP2)
return X,EP1,EP2
def load_documents(path,
def _features_mendel_hall(documents):
raise NotImplementedError('not yet implemented')
pass
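Purely as context for the stub above: Mendenhall's "characteristic curve of composition" is conventionally the distribution of word lengths in a text. A hypothetical sketch of what this function might eventually compute (the 23-letter cap and the helper name are assumptions, not part of the commit):

def _features_mendel_hall_sketch(documents, max_len=23):
    # Hypothetical: relative frequency of word lengths 1..max_len per document.
    features = []
    for text in documents:
        tokens = [t.lower() for t in nltk.word_tokenize(text) if any(c.isalpha() for c in t)]
        counts = np.zeros(max_len)
        for token in tokens:
            counts[min(len(token), max_len) - 1] += 1
        features.append(counts / max(len(tokens), 1))
    return np.array(features)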
class LoadDocuments:
def __init__(self,
function_words_freq=True,
tfidf=False,
tfidf_feat_selection_ratio=1.,
mendelhall=False,
split_documents=False,
split_policy = _split_by_endline,
split_policy = split_by_endline,
normalize_features=True,
verbose=True):
"""
Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
contain files named according to <author>_<text_name>.txt plus two special files EpistolaXIII_1.txt and
EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
:param path: the path containing the texts, each named as <author>_<text_name>.txt
:param function_words_freq: add the frequency of function words as features
:param tfidf: add the tfidf as features
:param split_documents: whether to split each text into smaller documents (currently, the policy is to split by '\n').
Currently, the fragments resulting from the split are added to the pool of documents (i.e., they are added
alongside the full documents rather than replacing them).
:param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
:param verbose: show information by stdout or not
:return: X, y, EP1, EP2, where X is the feature matrix for the training set (np.array or csr_matrix, depending on
whether tfidf is activated) and y is the label vector (np.array); EP1 and EP2 are the feature matrices for
Epistola 1 and Epistola 2, respectively: the first row corresponds to the full document and, if
split_documents=True, the remaining rows correspond to its fragments.
"""
"""
Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
contain files named according to <author>_<text_name>.txt plus two special files EpistolaXIII_1.txt and
EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
:param path: the path containing the texts, each named as <author>_<text_name>.txt
:param function_words_freq: add the frequency of function words as features
:param tfidf: add the tfidf as features
:param split_documents: whether to split each text into smaller documents (currently, the policy is to split by '\n').
Currently, the fragments resulting from the split are added to the pool of documents (i.e., they are added
alongside the full documents rather than replacing them).
:param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
:param verbose: show information by stdout or not
:return: X, y, EP1, EP2, where X is the feature matrix for the training set (np.array or csr_matrix, depending on
whether tfidf is activated) and y is the label vector (np.array); EP1 and EP2 are the feature matrices for
Epistola 1 and Epistola 2, respectively: the first row corresponds to the full document and, if
split_documents=True, the remaining rows correspond to its fragments.
"""
documents, authors, ep1_text, ep2_text = _load_texts(path)
ep1,ep2 = [ep1_text],[ep2_text]
n_original_docs=len(documents)
self.normalize_features=normalize_features
self.split_documents = split_documents
self.split_policy = split_policy
self.function_words_freq=function_words_freq
self.mendelhall = mendelhall
self.tfidf = tfidf
self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
self.verbose = verbose
if split_documents:
doc_fragments, authors_fragments = splitter(documents, authors, split_policy=split_policy)
documents.extend(doc_fragments)
authors.extend(authors_fragments)
def load(self, path):
documents, authors, ep1_text, ep2_text = _load_texts(path)
ep1,ep2 = [ep1_text],[ep2_text]
n_original_docs=len(documents)
ep1.extend(splitter(ep1, split_policy=split_policy))
ep2.extend(splitter(ep2, split_policy=split_policy))
if self.split_documents:
doc_fragments, authors_fragments = splitter(documents, authors, split_policy=self.split_policy)
documents.extend(doc_fragments)
authors.extend(authors_fragments)
ep1.extend(splitter(ep1, split_policy=self.split_policy))
ep2.extend(splitter(ep2, split_policy=self.split_policy))
# represent the target vector
y = np.array([(1 if author == "Dante" else 0) for author in authors])
# initialize the document-by-feature vector
X = np.empty((len(documents), 0))
EP1 = np.empty((len(ep1), 0))
EP2 = np.empty((len(ep2), 0))
# dense feature extraction functions
if self.function_words_freq:
X = self.addfeatures(X,_features_function_words_freq(documents))
EP1 = self.addfeatures(EP1, _features_function_words_freq(ep1))
EP2 = self.addfeatures(EP2, _features_function_words_freq(ep2))
if self.mendelhall:
X = self.addfeatures(X, _features_mendel_hall(documents))
EP1 = self.addfeatures(EP1, _features_mendel_hall(ep1))
EP2 = self.addfeatures(EP2, _features_mendel_hall(ep2))
# sparse feature extraction functions
if self.tfidf:
X_features, vectorizer = _features_tfidf(documents)
ep1_features, _ = _features_tfidf(ep1, vectorizer)
ep2_features, _ = _features_tfidf(ep2, vectorizer)
if self.tfidf_feat_selection_ratio < 1.:
if self.verbose: print('feature selection')
X_features, ep1_features, ep2_features = \
_feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
# matrix is sparse now
X = self.addfeatures(csr_matrix(X), X_features)
EP1 = self.addfeatures(csr_matrix(EP1), ep1_features)
EP2 = self.addfeatures(csr_matrix(EP2), ep2_features)
# represent the target vector
y = np.array([(1 if author == "Dante" else 0) for author in authors])
# print summary
if self.verbose:
print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
.format(self.function_words_freq, self.tfidf, self.split_documents, self.split_policy.__name__))
print('number of training (full) documents: {}'.format(n_original_docs))
print('X shape (#documents,#features): {}'.format(X.shape))
print('y prevalence: {:.2f}%'.format(y.mean()*100))
print('Epistola 1 shape:', EP1.shape)
print('Epistola 2 shape:', EP2.shape)
print()
# initialize the document-by-feature vector
X = np.empty((len(documents), 0))
EP1 = np.empty((len(ep1), 0))
EP2 = np.empty((len(ep2), 0))
return X, y, EP1, EP2
if function_words_freq:
X = np.hstack((X,_features_function_words_freq(documents)))
EP1 = np.hstack((EP1, _features_function_words_freq(ep1)))
EP2 = np.hstack((EP2, _features_function_words_freq(ep2)))
def addfeatures(self, X, F):
# plt.matshow(F[:25])
# plt.show()
if self.normalize_features:
F = normalize(F, axis=1, copy=False)  # reassign: normalize may return a copy rather than operate in place
if tfidf:
X_features, vectorizer = _features_tfidf(documents)
ep1_features, _ = _features_tfidf(ep1, vectorizer)
ep2_features, _ = _features_tfidf(ep2, vectorizer)
if issparse(F):
return hstack((X, F)) # sparse
else:
return np.hstack((X, F)) # dense
if tfidf_feat_selection_ratio < 1.:
if verbose: print('feature selection')
X_features, ep1_features, ep2_features = \
_feature_selection(X_features, y, ep1_features, ep2_features, tfidf_feat_selection_ratio)
# matrix is sparse now
X = hstack((csr_matrix(X), X_features))
EP1 = hstack((csr_matrix(EP1), ep1_features))
EP2 = hstack((csr_matrix(EP2), ep2_features))
# print summary
if verbose:
print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
.format(function_words_freq, tfidf, split_documents, split_policy.__name__))
print('number of training (full) documents: {}'.format(n_original_docs))
print('X shape (#documents,#features): {}'.format(X.shape))
print('y prevalence: {:.2f}%'.format(y.mean()*100))
print('Epistola 1 shape:', EP1.shape)
print('Epistola 2 shape:', EP2.shape)
print()
return X, y, EP1, EP2
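The addfeatures helper introduced above stacks each new block of features onto the current matrix, optionally L2-normalizing the block row-wise first, and switches to a sparse representation as soon as a sparse block (the tfidf features) arrives. A tiny self-contained illustration of that behaviour, with made-up toy values:

import numpy as np
from scipy.sparse import csr_matrix, hstack, issparse
from sklearn.preprocessing import normalize

X = np.empty((2, 0))                                             # start with no features, as in load()
dense_block = normalize(np.array([[3., 4.], [6., 8.]]), axis=1)  # each row scaled to unit L2 norm
X = np.hstack((X, dense_block))                                  # dense + dense -> dense

sparse_block = csr_matrix([[1., 0.], [0., 2.]])
X = hstack((csr_matrix(X), sparse_block))                        # once a block is sparse, the result stays sparse
print(issparse(X), X.shape)                                      # True (2, 4)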

src/requisites.txt (new file, 3 additions)
View File

@@ -0,0 +1,3 @@
scikit-learn >= 0.19.1
scipy >= 1.0.0
numpy >= 1.15.2