From 634a8a099b58cf37ca2eecc3bc3c80e8dc0b656f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Moreo=20Fern=C3=A1ndez?=
 <alejandro.moreo@isti.cnr.it>
Date: Mon, 15 Oct 2018 14:26:39 +0200
Subject: [PATCH] example code

---
 src/classifier.py         | 43 +++++++++++++++++++++++++++++++++
 src/doc_representation.py | 50 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 src/classifier.py
 create mode 100644 src/doc_representation.py

diff --git a/src/classifier.py b/src/classifier.py
new file mode 100644
index 0000000..874c02e
--- /dev/null
+++ b/src/classifier.py
@@ -0,0 +1,43 @@
+from sklearn.svm import LinearSVC
+from sklearn.model_selection import cross_val_score, GridSearchCV
+from sklearn.feature_selection import SelectKBest, chi2
+from doc_representation import *
+
+# --- experiment configuration ---
+nfolds = 2  # folds used both by the grid search and by the outer cross-validation
+do_feat_selection = True  # keep only the top 10% of the features, ranked by chi2
+params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
+           'class_weight': ['balanced', None]}]
+path = '/home/moreo/Dante/testi'
+
+# by_sentences=True: every line of every file is one document; ep1/ep2 are the EpistolaXIII texts
+Xtr, ytr, ep1, ep2 = load_documents(path, by_sentences=True)
+
+if do_feat_selection:
+    print('feature selection')
+    num_feats = int(0.1 * Xtr.shape[1])
+    feature_selector = SelectKBest(chi2, k=num_feats).fit(Xtr, ytr)
+    # project the training and the held-out matrices onto the same selected columns
+    Xtr, ep1, ep2 = [feature_selector.transform(m) for m in (Xtr, ep1, ep2)]
+    print('final shape={}'.format(Xtr.shape))
+
+# learn a SVM: grid search over `params`, scored with nfolds-fold cross-validation
+print('optimizing a SVM')
+svm_base = LinearSVC()
+svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
+svm_optimized.fit(Xtr, ytr)
+print('Best params: {}'.format(svm_optimized.best_params_))
+
+# evaluation of results: cross-validate the whole grid search on the training data
+print('computng the cross-val score')
+accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
+acc_mean = accuracies.mean()
+acc_std = accuracies.std()
+print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))
+
+# final test: the value printed as "acc" is the mean of the predicted 0/1 labels
+print('predicting the Epistolas')
+ep1_, ep2_ = svm_optimized.predict(ep1), svm_optimized.predict(ep2)
+print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
+print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))
+
diff --git a/src/doc_representation.py b/src/doc_representation.py
new file mode 100644
index 0000000..9e45ebf
--- /dev/null
+++ b/src/doc_representation.py
@@ -0,0 +1,50 @@
+import os
+from os.path import join
+from sklearn.feature_extraction.text import TfidfVectorizer
+import numpy as np
+
+def load_documents(path, by_sentences=False):
+    """Load the corpus under `path` and return tf-idf matrices for it.
+
+    Files are expected to be named `<author>_<title>.txt`; a document is
+    labelled 1 when its author prefix is 'Dante' and 0 otherwise.  The
+    `EpistolaXIII_*` files are kept out of the training set.
+
+    :param path: directory containing the text files
+    :param by_sentences: if True every line of a file is one document,
+        otherwise every file is one document
+    :return: (X, y, Epistola1, Epistola2) -- tf-idf matrix of the training
+        documents, their labels as a numpy array, and the tf-idf matrices of
+        'EpistolaXIII_1.txt' and 'EpistolaXIII_2.txt' in the same feature space
+    """
+    def read_text(filename):
+        # context manager, so every file handle is closed as soon as it is read
+        with open(join(path, filename)) as fin:
+            return fin.readlines() if by_sentences else [fin.read()]
+
+    docs, y = [], []
+    for filename in os.listdir(path):
+        if filename.startswith('EpistolaXIII_'):
+            continue  # held-out texts, never part of the training set
+        author = filename.replace('.txt', '').split('_')[0]
+        texts = read_text(filename)
+        docs.extend(texts)
+        y.extend([1 if author == 'Dante' else 0] * len(texts))
+
+    if not by_sentences:
+        # NOTE(review): whole-document mode doubles the corpus -- presumably to get more samples; confirm
+        y = y + y
+        docs = docs + docs
+
+    ep1 = read_text('EpistolaXIII_1.txt')
+    ep2 = read_text('EpistolaXIII_2.txt')
+
+    # document representation: vocabulary and idf are fitted on the training docs only
+    tfidf = TfidfVectorizer(sublinear_tf=True)
+    X = tfidf.fit_transform(docs)
+    Epistola1, Epistola2 = tfidf.transform(ep1), tfidf.transform(ep2)
+
+    print('documents read, shape={}'.format(X.shape))
+    return X, np.array(y), Epistola1, Epistola2
+
+