# Reconstructed from a git format-patch ("Experiment", 2018-10-30) that added
# two files:
#   src/doc_representation2.py  -- stylometric feature extraction
#   src/classifier2.py          -- SVM training / evaluation script
# Combined here as one clean module; the script portion is guarded by
# __main__ so importing this module has no side effects.

import os
from os.path import join

import nltk
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import LinearSVC

# ---------------------------------------------------------------------------
# doc_representation2: feature extraction
# ---------------------------------------------------------------------------

# Latin function words used as stylometric features (relative frequencies).
FUNCTION_WORDS = ["et", "in", "de", "ad", "ut", "cum", "non", "per",
                  "a", "que", "ex", "sed"]


def _tokenize_file(filepath):
    """Read *filepath* (UTF-8) and return lowercased tokens that contain
    at least one alphabetic character (drops pure punctuation/numbers)."""
    with open(filepath, encoding="utf8") as fh:
        text = fh.read()
    return [tok.lower() for tok in nltk.word_tokenize(text)
            if any(ch.isalpha() for ch in tok)]


def _function_word_features(tokens):
    """Return the per-mille frequency of each FUNCTION_WORDS entry in
    *tokens*: count(word) * 1000 / len(tokens)."""
    freqs = nltk.FreqDist(tokens)
    total = len(tokens)
    return [(freqs[word] * 1000) / total for word in FUNCTION_WORDS]


def load_documents(path):
    """Build the training set and the Epistola XIII_2 feature vector.

    Every ``Author_Title.txt`` file directly under *path* becomes one
    training row of function-word frequencies; the label is 1 when the
    author part of the filename is "Dante", else 0.  Files whose name
    starts with ``EpistolaXIII_`` are excluded from training;
    ``EpistolaXIII_2.txt`` is read separately as the document to attribute.

    Returns:
        X   (list[list[float]]): training feature rows.
        y   (np.ndarray): 0/1 labels aligned with X.
        ep2 (np.ndarray): 1-D feature vector for Epistola XIII_2.
    """
    X, y = [], []
    for filename in os.listdir(path):
        if filename.startswith('EpistolaXIII_'):
            continue
        author = filename.replace('.txt', '').split('_')[0]
        tokens = _tokenize_file(join(path, filename))
        X.append(_function_word_features(tokens))
        y.append(1 if author == "Dante" else 0)

    # NOTE(review): the original duplicated the whole training set (X+X,
    # y+y), presumably so 2-fold CV has enough samples per class.  Kept for
    # behavioural parity, but duplicated rows leak across CV folds and
    # inflate the reported accuracy -- confirm this is intentional.
    X = X + X
    y = np.array(y + y)

    ep2_tokens = _tokenize_file(join(path, 'EpistolaXIII_2.txt'))
    ep2 = np.array(_function_word_features(ep2_tokens))

    return X, y, ep2


# ---------------------------------------------------------------------------
# classifier2: SVM training and evaluation script
# ---------------------------------------------------------------------------

NFOLDS = 2
DO_FEAT_SELECTION = False
PARAM_GRID = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
               'class_weight': ['balanced', None]}]


def main(path='Data'):
    """Train a grid-searched LinearSVC on the corpus under *path*, report
    cross-validated accuracy, and predict the authorship of Epistola XIII_2."""
    Xtr, ytr, ep2 = load_documents(path)

    if DO_FEAT_SELECTION:
        print('feature selection')
        # Keep the top 10% of features by chi-squared score.
        num_feats = int(0.1 * np.shape(Xtr)[1])
        selector = SelectKBest(chi2, k=num_feats)
        Xtr = selector.fit_transform(Xtr, ytr)
        print('final shape={}'.format(Xtr.shape))
        ep2 = selector.transform(ep2)

    # Fix: the original passed max_iter=-1 to LinearSVC, which is SVC's
    # "no limit" convention and is invalid for LinearSVC (max_iter must be
    # positive).  A large positive budget addresses the convergence
    # warnings the original author noted.
    print('optimizing a SVM')
    svm_optimized = GridSearchCV(LinearSVC(max_iter=10000),
                                 param_grid=PARAM_GRID, cv=NFOLDS)
    svm_optimized.fit(Xtr, ytr)

    print('computing the cross-val score')
    accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=NFOLDS, n_jobs=-1)
    print('Accuracy={:.3f} (+-{:.3f})'.format(accuracies.mean(),
                                              accuracies.std()))

    # Single sample: scikit-learn predict() expects a 2-D array.
    print('predicting the Epistolas')
    ep2_ = svm_optimized.predict(np.reshape(ep2, (1, -1)))
    print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))


if __name__ == "__main__":
    main()