Experiment: SVM authorship test of Epistola XIII on function-word features
This commit is contained in:
parent 634a8a099b
commit bd09d635f6
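The commit adds two files: an experiment script that tunes and cross-validates an SVM on function-word frequencies, and the document-representation module it imports (doc_representation2), which maps each training text to a vector of function-word rates and builds the feature vector for the disputed EpistolaXIII_2.txt.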
@@ -0,0 +1,46 @@
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from doc_representation2 import load_documents
import numpy as np

nfolds = 2
do_feat_selection = False
params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
           'class_weight': ['balanced', None]}]

path = 'Data'
Xtr, ytr, ep2 = load_documents(path)
ep2 = np.reshape(ep2, (1, -1))  # single document: transform/predict expect a 2D array

if do_feat_selection:
    print('feature selection')
    num_feats = int(0.1 * Xtr.shape[1])  # keep the 10% highest-scoring features
    feature_selector = SelectKBest(chi2, k=num_feats)
    Xtr = feature_selector.fit_transform(Xtr, ytr)
    print('final shape={}'.format(Xtr.shape))
    #ep1 = feature_selector.transform(ep1)
    ep2 = feature_selector.transform(ep2)

# learn a SVM
print('optimizing a SVM')
svm_base = SVC(kernel='linear', max_iter=-1)  # results were not converging, so SVC is used instead of LinearSVC (kernel='linear' keeps the model linear)
svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
#print("Shape of X:", Xtr.shape)
svm_optimized.fit(Xtr, ytr)
#print('Best params: {}'.format(svm_optimized.best_params_))

# evaluation of results
print('computing the cross-val score')
accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
acc_mean, acc_std = accuracies.mean(), accuracies.std()
print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))

# final test
print('predicting the Epistolas')
#ep1_ = svm_optimized.predict(ep1)
ep2_ = svm_optimized.predict(ep2)
#print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
print('Epistola2 prediction = {}'.format(ep2_))
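For clarity, the evaluation above is a nested cross-validation: cross_val_score re-runs the whole grid search inside each outer fold, so hyperparameters are never tuned on the held-out documents. A minimal self-contained sketch of the same pattern on synthetic data (the random dataset and its shapes are illustrative assumptions, not the real corpus):

import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV

rng = np.random.RandomState(0)
X = rng.rand(20, 12)        # hypothetical: 20 documents x 12 function-word rates
y = rng.randint(0, 2, 20)   # hypothetical binary authorship labels

inner = GridSearchCV(SVC(kernel='linear'),
                     param_grid=[{'C': [0.01, 1, 100]}],
                     cv=2)                       # inner folds pick C
scores = cross_val_score(inner, X, y, cv=2)      # outer folds measure accuracy
print('Accuracy={:.3f} (+-{:.3f})'.format(scores.mean(), scores.std()))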
@@ -0,0 +1,50 @@
import nltk
import numpy as np
import os
from os.path import join

# Latin function words used as style markers
function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]

def load_documents(path):
    X, y = [], []
    for file in os.listdir(path):
        if file.startswith('EpistolaXIII_'): continue  # the disputed texts are not training data
        file_clean = file.replace('.txt', '')
        author, textname = file_clean.split('_', 1)
        with open(join(path, file), encoding="utf8") as f:
            tokens = nltk.word_tokenize(f.read())
        author_tokens = [token.lower() for token in tokens
                         if any(char.isalpha() for char in token)]
        freqs = nltk.FreqDist(author_tokens)
        #print(f"From {textname} by {author}:")
        features = []
        for function_word in function_words:
            # rate of the function word per 1000 alphabetic tokens
            feature = (freqs[function_word] * 1000) / len(author_tokens)
            #print(function_word, " = ", freqs[function_word], ", ", feature)
            features.append(feature)
        X.append(features)
        y.append(1 if author == "Dante" else 0)

    # duplicate the small training corpus (the experiment uses 2-fold CV downstream)
    X = np.array(X + X)
    y = np.array(y + y)

    # build the feature vector for the disputed EpistolaXIII_2.txt
    ep = []
    with open(join(path, 'EpistolaXIII_2.txt'), encoding="utf8") as f:
        tokens = nltk.word_tokenize(f.read())
    ep2_tokens = [token.lower() for token in tokens
                  if any(char.isalpha() for char in token)]
    freqs = nltk.FreqDist(ep2_tokens)
    #print("From Epistola XIII_2:")
    for function_word in function_words:
        feature = freqs[function_word] * 1000 / len(ep2_tokens)
        ep.append(feature)
        #print(function_word, " = ", freqs[function_word], ", ", feature)
    ep2 = np.array(ep)

    return X, y, ep2
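For reference, each feature is a relative rate per 1000 alphabetic tokens: if "et" occurs 42 times among 3000 tokens, its value is 42 * 1000 / 3000 = 14.0. A minimal usage sketch of load_documents follows; the Data directory of <Author>_<Title>.txt files is an assumption inferred from the filename parsing above, and the example filenames are hypothetical.

# hypothetical corpus layout: Data/Dante_Monarchia.txt, Data/Boccaccio_Epistola.txt, ...
X, y, ep2 = load_documents('Data')
print(X.shape)    # (2 * n_texts, 12): function-word rates, corpus duplicated
print(y)          # 1 = Dante, 0 = any other author
print(ep2.shape)  # (12,): feature vector for EpistolaXIII_2.txt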