Alejandro Moreo Fernandez 2018-11-02 17:08:32 +01:00
parent 451dfd544d
commit 80956499d0
4 changed files with 140 additions and 77 deletions


@@ -1,30 +1,37 @@
 from sklearn.svm import *
 from sklearn.model_selection import cross_val_score, GridSearchCV
-from doc_representation import *
 from sklearn.metrics import f1_score, make_scorer
+from doc_representation import *
 
-probability=False
-# SVM = SVC
-SVM = LinearSVC
+# TODO: add function words
+# TODO: other split policies
+# TODO: understand normalization
+# TODO: mendel hall
+# TODO: wrap into an Estimator
+
+probability=True
+SVM = SVC
+# SVM = LinearSVC
 
 nfolds = 3
 params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
 if SVM is SVC:
     params['kernel']=['linear','rbf']
 
 path = '../testi'
-Xtr,ytr,ep1,ep2 = load_documents(path, split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1)
+reader = LoadDocuments(split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1, split_policy=split_by_endline, normalize_features=True)
+Xtr,ytr,ep1,ep2 = reader.load(path)
 
 # learn a SVM
-# svm = SVM(probability=probability)
-svm = SVM()
+svm = SVM(probability=probability)
+# svm = SVM()
 positive_examples = ytr.sum()
 if positive_examples>nfolds:
     print('optimizing {}'.format(svm.__class__.__name__))
-    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score))
+    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
 svm.fit(Xtr, ytr)
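After fitting, the GridSearchCV wrapper exposes the selected configuration; a minimal sketch of how the winning parameters might be reported (illustrative, not part of this commit):

    if isinstance(svm, GridSearchCV):
        print('best params:', svm.best_params_)
        print('best cross-validated F1: {:.3f}'.format(svm.best_score_))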

src/disable_sklearn_warnings.py (new file)

@@ -0,0 +1,3 @@
+def warn(*args, **kwargs): pass
+import warnings
+warnings.warn = warn
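The new module suppresses sklearn's warnings by monkey-patching warnings.warn with a no-op. A sketch of the standard-library alternative, should a less invasive switch be preferred (an assumption, not what the commit does):

    import warnings
    warnings.filterwarnings('ignore')  # optionally narrow with category=... or module=...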

src/doc_representation.py

@@ -1,3 +1,4 @@
+import disable_sklearn_warnings
 import nltk
 import numpy as np
 import os
@@ -5,7 +6,9 @@ from os.path import join
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_selection import SelectKBest
 from sklearn.feature_selection import chi2
-from scipy.sparse import hstack, csr_matrix
+from sklearn.preprocessing import normalize
+from scipy.sparse import hstack, csr_matrix, issparse
+import matplotlib.pyplot as plt
 
 function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]
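normalize, newly imported above, rescales each row of a feature matrix to unit norm, which is presumably what the new normalize_features option relies on; a minimal illustration of its default row-wise L2 behavior:

    import numpy as np
    from sklearn.preprocessing import normalize

    F = np.array([[3., 4.]])
    print(normalize(F, norm='l2', axis=1))  # [[0.6 0.8]]: each row divided by its L2 norm (5.0)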
@@ -40,11 +43,15 @@ def _load_texts(path):
 # split policies
 # ------------------------------------------------------------------------
 # TODO: implement other split policies (e.g., overlapping ones, etc)
-def _split_by_endline(text):
+def split_by_endline(text):
     return [t.strip() for t in text.split('\n') if t.strip()]
 
-def splitter(documents, authors=None, split_policy=_split_by_endline):
+
+def split_by_sentences(text):
+    pass
+
+
+def splitter(documents, authors=None, split_policy=split_by_endline):
     fragments = []
     authors_fragments = []
     for i, text in enumerate(documents):
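split_by_sentences is committed as a stub; a minimal sketch of how it might be filled in, assuming NLTK's sentence tokenizer (hypothetical, and it requires the 'punkt' data to be downloaded):

    import nltk

    def split_by_sentences(text):
        # hypothetical: one fragment per sentence, mirroring split_by_endline's cleanup
        return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]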
@@ -71,6 +78,7 @@ def _features_function_words_freq(documents):
     for text in documents:
         tokens = nltk.word_tokenize(text)
         author_tokens = ([token.lower() for token in tokens if any(char.isalpha() for char in token)])
+        # author_tokens = ([token.lower() for token in tokens])
         freqs = nltk.FreqDist(author_tokens)
         nwords = len(author_tokens)
@@ -96,6 +104,7 @@ def _features_tfidf(documents, tfidf_vectorizer=None):
     return features, tfidf_vectorizer
 
+
 def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
     nF = X.shape[1]
     num_feats = int(tfidf_feat_selection_ratio * nF)
@@ -105,84 +114,125 @@ def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
     EP2 = feature_selector.transform(EP2)
     return X,EP1,EP2
 
-def load_documents(path,
-                   function_words_freq=True,
-                   tfidf=False,
-                   tfidf_feat_selection_ratio=1.,
-                   split_documents=False,
-                   split_policy=_split_by_endline,
-                   verbose=True):
+
+def _features_mendel_hall(documents):
+    raise NotImplementedError('not yet implemented')
+
+
+class LoadDocuments:
+    def __init__(self,
+                 function_words_freq=True,
+                 tfidf=False,
+                 tfidf_feat_selection_ratio=1.,
+                 mendelhall=False,
+                 split_documents=False,
+                 split_policy=split_by_endline,
+                 normalize_features=True,
+                 verbose=True):
         """
         Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
         contain files named according to <author>_<text_name>.txt plus two special files EpistolaXIII_1.txt and
         EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
         :param path: the path containing the texts, each named as <author>_<text_name>.txt
         :param function_words_freq: add the frequency of function words as features
         :param tfidf: add the tfidf as features
         :param split_documents: whether to split text into smaller documents or not (currently, the policy is to split
        by '\n'). Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do
        not replace the full documents, which are anyway retained).
         :param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
         :param verbose: show information by stdout or not
         :return: np.arrays or csr_matrix (depending on whether tfidf is activated or not) X, y, EP1, EP2, where X is the
        matrix of features for the training set and y are the labels (np.array);
        EP1 and EP2 are the matrices of features for epistola 1 (first row, plus its fragments from the 2nd row to the
        last if split_documents=True) and epistola 2 (similar)
         """
-    documents, authors, ep1_text, ep2_text = _load_texts(path)
-    ep1,ep2 = [ep1_text],[ep2_text]
-    n_original_docs=len(documents)
-
-    if split_documents:
-        doc_fragments, authors_fragments = splitter(documents, authors, split_policy=split_policy)
-        documents.extend(doc_fragments)
-        authors.extend(authors_fragments)
-        ep1.extend(splitter(ep1, split_policy=split_policy))
-        ep2.extend(splitter(ep2, split_policy=split_policy))
-
-    # represent the target vector
-    y = np.array([(1 if author == "Dante" else 0) for author in authors])
-
-    # initialize the document-by-feature vector
-    X = np.empty((len(documents), 0))
-    EP1 = np.empty((len(ep1), 0))
-    EP2 = np.empty((len(ep2), 0))
-
-    if function_words_freq:
-        X = np.hstack((X,_features_function_words_freq(documents)))
-        EP1 = np.hstack((EP1, _features_function_words_freq(ep1)))
-        EP2 = np.hstack((EP2, _features_function_words_freq(ep2)))
-
-    if tfidf:
-        X_features, vectorizer = _features_tfidf(documents)
-        ep1_features, _ = _features_tfidf(ep1, vectorizer)
-        ep2_features, _ = _features_tfidf(ep2, vectorizer)
-        if tfidf_feat_selection_ratio < 1.:
-            if verbose: print('feature selection')
-            X_features, ep1_features, ep2_features = \
-                _feature_selection(X_features, y, ep1_features, ep2_features, tfidf_feat_selection_ratio)
-        # matrix is sparse now
-        X = hstack((csr_matrix(X), X_features))
-        EP1 = hstack((csr_matrix(EP1), ep1_features))
-        EP2 = hstack((csr_matrix(EP2), ep2_features))
-
-    # print summary
-    if verbose:
-        print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
-              .format(function_words_freq, tfidf, split_documents, split_policy.__name__))
-        print('number of training (full) documents: {}'.format(n_original_docs))
-        print('X shape (#documents,#features): {}'.format(X.shape))
-        print('y prevalence: {:.2f}%'.format(y.mean()*100))
-        print('Epistola 1 shape:', EP1.shape)
-        print('Epistola 2 shape:', EP2.shape)
-        print()
-
-    return X, y, EP1, EP2
+        self.normalize_features = normalize_features
+        self.split_documents = split_documents
+        self.split_policy = split_policy
+        self.function_words_freq = function_words_freq
+        self.mendelhall = mendelhall
+        self.tfidf = tfidf
+        self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
+        self.verbose = verbose
+
+    def load(self, path):
+        documents, authors, ep1_text, ep2_text = _load_texts(path)
+        ep1,ep2 = [ep1_text],[ep2_text]
+        n_original_docs=len(documents)
+
+        if self.split_documents:
+            doc_fragments, authors_fragments = splitter(documents, authors, split_policy=self.split_policy)
+            documents.extend(doc_fragments)
+            authors.extend(authors_fragments)
+            ep1.extend(splitter(ep1, split_policy=self.split_policy))
+            ep2.extend(splitter(ep2, split_policy=self.split_policy))
+
+        # represent the target vector
+        y = np.array([(1 if author == "Dante" else 0) for author in authors])
+
+        # initialize the document-by-feature vector
+        X = np.empty((len(documents), 0))
+        EP1 = np.empty((len(ep1), 0))
+        EP2 = np.empty((len(ep2), 0))
+
+        # dense feature extraction functions
+        if self.function_words_freq:
+            X = self.addfeatures(X, _features_function_words_freq(documents))
+            EP1 = self.addfeatures(EP1, _features_function_words_freq(ep1))
+            EP2 = self.addfeatures(EP2, _features_function_words_freq(ep2))
+
+        if self.mendelhall:
+            X = self.addfeatures(X, _features_mendel_hall(documents))
+            EP1 = self.addfeatures(EP1, _features_mendel_hall(ep1))
+            EP2 = self.addfeatures(EP2, _features_mendel_hall(ep2))
+
+        # sparse feature extraction functions
+        if self.tfidf:
+            X_features, vectorizer = _features_tfidf(documents)
+            ep1_features, _ = _features_tfidf(ep1, vectorizer)
+            ep2_features, _ = _features_tfidf(ep2, vectorizer)
+            if self.tfidf_feat_selection_ratio < 1.:
+                if self.verbose: print('feature selection')
+                X_features, ep1_features, ep2_features = \
+                    _feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
+            # matrix is sparse now
+            X = self.addfeatures(csr_matrix(X), X_features)
+            EP1 = self.addfeatures(csr_matrix(EP1), ep1_features)
+            EP2 = self.addfeatures(csr_matrix(EP2), ep2_features)
+
+        # print summary
+        if self.verbose:
+            print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
+                  .format(self.function_words_freq, self.tfidf, self.split_documents, self.split_policy.__name__))
+            print('number of training (full) documents: {}'.format(n_original_docs))
+            print('X shape (#documents,#features): {}'.format(X.shape))
+            print('y prevalence: {:.2f}%'.format(y.mean()*100))
+            print('Epistola 1 shape:', EP1.shape)
+            print('Epistola 2 shape:', EP2.shape)
+            print()
+
+        return X, y, EP1, EP2
+
+    def addfeatures(self, X, F):
+        # plt.matshow(F[:25]); plt.show()  # (debugging aid, left commented out)
+        if self.normalize_features:
+            normalize(F, axis=1, copy=False)  # L2-normalize each row, in place when possible
+        if issparse(F):
+            return hstack((X, F))     # sparse concatenation
+        else:
+            return np.hstack((X, F))  # dense concatenation
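For reference, a minimal usage sketch of the refactored API (directory layout as described in the docstring; illustrative, not part of the commit):

    from doc_representation import LoadDocuments, split_by_endline

    reader = LoadDocuments(split_documents=True, function_words_freq=True, tfidf=True,
                           tfidf_feat_selection_ratio=0.1, split_policy=split_by_endline,
                           normalize_features=True)
    X, y, EP1, EP2 = reader.load('../testi')

One caveat in addfeatures: sklearn.preprocessing.normalize returns the normalized matrix, and with copy=False it only operates in place when the input needs no conversion (e.g., a float array or CSR matrix), so writing F = normalize(F, axis=1, copy=False) would be the more defensive pattern.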

src/requisites.txt (new file)

@@ -0,0 +1,3 @@
+scikit-learn >= 0.19.1
+scipy >= 1.0.0
+numpy >= 1.15.2
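Assuming the file is meant for pip, the environment would be set up with:

    pip install -r src/requisites.txt

Note that doc_representation.py additionally imports nltk and matplotlib, which this list does not pin.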