diff --git a/src/classifier.py b/src/classifier.py
index e4b00e2..c23fc08 100644
--- a/src/classifier.py
+++ b/src/classifier.py
@@ -1,30 +1,37 @@
 from sklearn.svm import *
 from sklearn.model_selection import cross_val_score, GridSearchCV
-from doc_representation import *
 from sklearn.metrics import f1_score, make_scorer
+from doc_representation import *
 
-probability=False
-# SVM = SVC
-SVM = LinearSVC
+# TODO: add function words
+# TODO: other split policies
+# TODO: understand normalization
+# TODO: Mendenhall features
+# TODO: wrap into an Estimator
+
+probability=True
+SVM = SVC
+# SVM = LinearSVC
 
 nfolds = 3
 params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced',None]}
 if SVM is SVC:
-    params['kernel']=['linear','rbf']
+    params['kernel']=['linear','rbf']
 
 path = '../testi'
-Xtr,ytr,ep1,ep2 = load_documents(path, split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1)
+reader = LoadDocuments(split_documents=True, function_words_freq=True, tfidf=True, tfidf_feat_selection_ratio=0.1, split_policy=split_by_endline, normalize_features=True)
+Xtr,ytr,ep1,ep2 = reader.load(path)
 
 # learn a SVM
-# svm = SVM(probability=probability)
-svm = SVM()
+svm = SVM(probability=probability)
+# svm = SVM()
 
 positive_examples = ytr.sum()
 if positive_examples>nfolds:
     print('optimizing {}'.format(svm.__class__.__name__))
-    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score))
+    svm = GridSearchCV(svm, param_grid=params, cv=nfolds, scoring=make_scorer(f1_score), n_jobs=-1)
 
 svm.fit(Xtr, ytr)
diff --git a/src/disable_sklearn_warnings.py b/src/disable_sklearn_warnings.py
new file mode 100644
index 0000000..e669983
--- /dev/null
+++ b/src/disable_sklearn_warnings.py
@@ -0,0 +1,3 @@
+def warn(*args, **kwargs): pass
+import warnings
+warnings.warn = warn
diff --git a/src/doc_representation.py b/src/doc_representation.py
index d194d95..abaaf2e 100644
--- a/src/doc_representation.py
+++ b/src/doc_representation.py
@@ -1,3 +1,4 @@
+import disable_sklearn_warnings
 import nltk
 import numpy as np
 import os
@@ -5,7 +6,9 @@ from os.path import join
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_selection import SelectKBest
 from sklearn.feature_selection import chi2
-from scipy.sparse import hstack, csr_matrix
+from sklearn.preprocessing import normalize
+from scipy.sparse import hstack, csr_matrix, issparse
+import matplotlib.pyplot as plt
 
 function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]
 
@@ -40,11 +43,15 @@ def _load_texts(path):
 # split policies
 # ------------------------------------------------------------------------
 # TODO: implement other split policies (e.g., overlapping ones, etc)
-def _split_by_endline(text):
+def split_by_endline(text):
     return [t.strip() for t in text.split('\n') if t.strip()]
 
 
-def splitter(documents, authors=None, split_policy=_split_by_endline):
+def split_by_sentences(text):
+    pass
+
+
+def splitter(documents, authors=None, split_policy=split_by_endline):
     fragments = []
     authors_fragments = []
     for i, text in enumerate(documents):
@@ -71,6 +78,7 @@ def _features_function_words_freq(documents):
     for text in documents:
         tokens = nltk.word_tokenize(text)
         author_tokens = ([token.lower() for token in tokens if any(char.isalpha() for char in token)])
+        # author_tokens = ([token.lower() for token in tokens])
        freqs = nltk.FreqDist(author_tokens)
 
         nwords = len(author_tokens)
@@ -96,6 +104,7 @@ def _features_tfidf(documents, tfidf_vectorizer=None):
 
     return features, tfidf_vectorizer
 
+
 def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
     nF = X.shape[1]
     num_feats = int(tfidf_feat_selection_ratio * nF)
@@ -105,84 +114,125 @@ def _feature_selection(X, y, EP1, EP2, tfidf_feat_selection_ratio):
     EP2 = feature_selector.transform(EP2)
     return X,EP1,EP2
 
-def load_documents(path,
+
+def _features_mendel_hall(documents):
+    raise NotImplementedError('not yet implemented')
+    pass
+
+
+
+
+
+class LoadDocuments:
+    def __init__(self,
                    function_words_freq=True,
                    tfidf=False,
                    tfidf_feat_selection_ratio=1.,
+                   mendelhall=False,
                    split_documents=False,
-                   split_policy = _split_by_endline,
+                   split_policy = split_by_endline,
+                   normalize_features=True,
                    verbose=True):
-    """
-    Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
-    contain files named according to <author>_<textname>.txt plus two special files EpistolaXIII_1.txt and
-    EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
-    :param path: the path containing the texts, each named as <author>_<textname>.txt
-    :param function_words_freq: add the frequency of function words as features
-    :param tfidf: add the tfidf as features
-    :param split_documents: whether to split text into smaller documents or not (currenty, the policy is to split by '\n').
-    Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do not replace the
-    full documents, which are anyway retained).
-    :param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
-    :param verbose: show information by stdout or not
-    :return: np.arrays or csr_matrix (depending on whether tfidf is activated or not) X, y, EP1, EP2, where X is the
-    matrix of features for the training set and y are the labels (np.array);
-    EP1 and EP2 are the matrix of features for the epistola 1 (first row) and fragments (from row 2nd to last) if
-    split_documents=True) and 2 (similar)
-    """
+        """
+        Loads the documents contained in path applying a number of feature extraction policies. The directory is assumed to
+        contain files named according to <author>_<textname>.txt plus two special files EpistolaXIII_1.txt and
+        EpistolaXIII_2.txt concerning the two documents whose authorship attribution is to be determined.
+        :param path: the path containing the texts, each named as <author>_<textname>.txt
+        :param function_words_freq: add the frequency of function words as features
+        :param tfidf: add the tfidf as features
+        :param split_documents: whether to split text into smaller documents or not (currently, the policy is to split by '\n').
+        Currently, the fragments resulting from the split are added to the pool of documents (i.e., they do not replace the
+        full documents, which are anyway retained).
+        :param split_policy: a callable that implements the split to be applied (ignored if split_documents=False)
+        :param verbose: show information by stdout or not
+        :return: np.arrays or csr_matrix (depending on whether tfidf is activated or not) X, y, EP1, EP2, where X is the
+        matrix of features for the training set and y are the labels (np.array);
+        EP1 and EP2 are the matrices of features for epistola 1 and epistola 2 respectively: the first row corresponds to the
+        full document and the remaining rows to its fragments (present only if split_documents=True)
+        """
 
-    documents, authors, ep1_text, ep2_text = _load_texts(path)
-    ep1,ep2 = [ep1_text],[ep2_text]
-    n_original_docs=len(documents)
+        self.normalize_features=normalize_features
+        self.split_documents = split_documents
+        self.split_policy = split_policy
+        self.function_words_freq=function_words_freq
+        self.mendelhall = mendelhall
+        self.tfidf = tfidf
+        self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
+        self.verbose = verbose
 
-    if split_documents:
-        doc_fragments, authors_fragments = splitter(documents, authors, split_policy=split_policy)
-        documents.extend(doc_fragments)
-        authors.extend(authors_fragments)
+    def load(self, path):
+        documents, authors, ep1_text, ep2_text = _load_texts(path)
+        ep1,ep2 = [ep1_text],[ep2_text]
+        n_original_docs=len(documents)
 
-        ep1.extend(splitter(ep1, split_policy=split_policy))
-        ep2.extend(splitter(ep2, split_policy=split_policy))
+        if self.split_documents:
+            doc_fragments, authors_fragments = splitter(documents, authors, split_policy=self.split_policy)
+            documents.extend(doc_fragments)
+            authors.extend(authors_fragments)
+
+            ep1.extend(splitter(ep1, split_policy=self.split_policy))
+            ep2.extend(splitter(ep2, split_policy=self.split_policy))
+
+        # represent the target vector
+        y = np.array([(1 if author == "Dante" else 0) for author in authors])
+
+        # initialize the document-by-feature vector
+        X = np.empty((len(documents), 0))
+        EP1 = np.empty((len(ep1), 0))
+        EP2 = np.empty((len(ep2), 0))
+
+        # dense feature extraction functions
+        if self.function_words_freq:
+            X = self.addfeatures(X,_features_function_words_freq(documents))
+            EP1 = self.addfeatures(EP1, _features_function_words_freq(ep1))
+            EP2 = self.addfeatures(EP2, _features_function_words_freq(ep2))
+
+        if self.mendelhall:
+            X = self.addfeatures(X, _features_mendel_hall(documents))
+            EP1 = self.addfeatures(EP1, _features_mendel_hall(ep1))
+            EP2 = self.addfeatures(EP2, _features_mendel_hall(ep2))
+
+        # sparse feature extraction functions
+        if self.tfidf:
+            X_features, vectorizer = _features_tfidf(documents)
+            ep1_features, _ = _features_tfidf(ep1, vectorizer)
+            ep2_features, _ = _features_tfidf(ep2, vectorizer)
+
+            if self.tfidf_feat_selection_ratio < 1.:
+                if self.verbose: print('feature selection')
+                X_features, ep1_features, ep2_features = \
+                    _feature_selection(X_features, y, ep1_features, ep2_features, self.tfidf_feat_selection_ratio)
+
+            # matrix is sparse now
+            X = self.addfeatures(csr_matrix(X), X_features)
+            EP1 = self.addfeatures(csr_matrix(EP1), ep1_features)
+            EP2 = self.addfeatures(csr_matrix(EP2), ep2_features)
 
-    # represent the target vector
-    y = np.array([(1 if author == "Dante" else 0) for author in authors])
+        # print summary
+        if self.verbose:
+            print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
+                  .format(self.function_words_freq, self.tfidf, self.split_documents, self.split_policy.__name__))
+            print('number of training (full) documents: {}'.format(n_original_docs))
+            print('X shape (#documents,#features): {}'.format(X.shape))
+            print('y prevalence: {:.2f}%'.format(y.mean()*100))
+            print('Epistola 1 shape:', EP1.shape)
+            print('Epistola 2 shape:', EP2.shape)
+            print()
 
-    # initialize the document-by-feature vector
-    X = np.empty((len(documents), 0))
-    EP1 = np.empty((len(ep1), 0))
-    EP2 = np.empty((len(ep2), 0))
+        return X, y, EP1, EP2
 
-    if function_words_freq:
-        X = np.hstack((X,_features_function_words_freq(documents)))
-        EP1 = np.hstack((EP1, _features_function_words_freq(ep1)))
-        EP2 = np.hstack((EP2, _features_function_words_freq(ep2)))
+    def addfeatures(self, X, F):
+        # plt.matshow(F[:25])
+        # plt.show()
+        if self.normalize_features:
+            F = normalize(F, axis=1, copy=False)
 
-    if tfidf:
-        X_features, vectorizer = _features_tfidf(documents)
-        ep1_features, _ = _features_tfidf(ep1, vectorizer)
-        ep2_features, _ = _features_tfidf(ep2, vectorizer)
+        if issparse(F):
+            return hstack((X, F))   # sparse
+        else:
+            return np.hstack((X, F))  # dense
 
-        if tfidf_feat_selection_ratio < 1.:
-            if verbose: print('feature selection')
-            X_features, ep1_features, ep2_features = \
-                _feature_selection(X_features, y, ep1_features, ep2_features, tfidf_feat_selection_ratio)
-
-        # matrix is sparse now
-        X = hstack((csr_matrix(X), X_features))
-        EP1 = hstack((csr_matrix(EP1), ep1_features))
-        EP2 = hstack((csr_matrix(EP2), ep2_features))
-
-
-    # print summary
-    if verbose:
-        print('load_documents: function_words_freq={} tfidf={}, split_documents={}, split_policy={}'
-              .format(function_words_freq, tfidf, split_documents, split_policy.__name__))
-        print('number of training (full) documents: {}'.format(n_original_docs))
-        print('X shape (#documents,#features): {}'.format(X.shape))
-        print('y prevalence: {:.2f}%'.format(y.mean()*100))
-        print('Epistola 1 shape:', EP1.shape)
-        print('Epistola 2 shape:', EP2.shape)
-        print()
-
-    return X, y, EP1, EP2
diff --git a/src/requisites.txt b/src/requisites.txt
new file mode 100644
index 0000000..1793191
--- /dev/null
+++ b/src/requisites.txt
@@ -0,0 +1,3 @@
+scikit-learn >= 0.19.1
+scipy >= 1.0.0
+numpy >= 1.15.2
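Note (not part of the patch): the diff introduces split_by_sentences and _features_mendel_hall only as stubs (the latter raises NotImplementedError). The sketch below shows one possible way to complete them; it assumes NLTK's Punkt models are installed (nltk.download('punkt')) and reads the "Mendenhall" TODO as Mendenhall's characteristic curve, i.e. the distribution of word lengths. The max_word_length parameter is an illustrative assumption, not taken from the repository.

import nltk
import numpy as np


def split_by_sentences(text):
    # split a text into non-empty sentence fragments using NLTK's Punkt tokenizer
    return [s.strip() for s in nltk.sent_tokenize(text) if s.strip()]


def _features_mendel_hall(documents, max_word_length=20):
    # for each document, the relative frequency of word lengths 1..max_word_length
    # (longer words are collapsed into the last bin)
    features = []
    for text in documents:
        tokens = [t for t in nltk.word_tokenize(text) if t.isalpha()]
        counts = np.zeros(max_word_length)
        for t in tokens:
            counts[min(len(t), max_word_length) - 1] += 1
        features.append(counts / max(len(tokens), 1))
    return np.array(features)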
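A second note: classifier.py now stops at svm.fit(Xtr, ytr), but since the patch switches to SVC with probability=True, the fitted model (or the GridSearchCV wrapping it) exposes predict_proba. A possible continuation, sketched here rather than taken from the repository, scores the two epistolas; it assumes both classes occur in ytr and follows the docstring's convention that row 0 of ep1/ep2 is the full document and the remaining rows are its fragments.

# hypothetical continuation of classifier.py: posterior of the "Dante" class (label 1)
prob_ep1 = svm.predict_proba(ep1)[:, 1]   # row 0 = full text, rows 1.. = fragments
prob_ep2 = svm.predict_proba(ep2)[:, 1]
print('Epistola 1: P(Dante | full text) = {:.3f}'.format(prob_ep1[0]))
print('Epistola 2: P(Dante | full text) = {:.3f}'.format(prob_ep2[0]))
print('Epistola 1: mean P(Dante) over fragments = {:.3f}'.format(prob_ep1[1:].mean()))
print('Epistola 2: mean P(Dante) over fragments = {:.3f}'.format(prob_ep2[1:].mean()))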