Experiment: SVM authorship test of Epistola XIII on function-word features
This commit is contained in:
parent 634a8a099b
commit bd09d635f6
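The commit adds two files: an experiment script that tunes and cross-validates an SVM on function-word frequencies, and the document-representation module it imports (doc_representation2), which maps each training text to a vector of function-word rates and builds the feature vector for the disputed EpistolaXIII_2.txt.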
@@ -0,0 +1,46 @@
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from doc_representation2 import load_documents
import numpy as np

nfolds = 2
do_feat_selection = False
params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
           'class_weight': ['balanced', None]}]

path = 'Data'
Xtr, ytr, ep2 = load_documents(path)
ep2 = np.reshape(ep2, (1, -1))  # single document: transform/predict expect a 2D array

if do_feat_selection:
    print('feature selection')
    num_feats = int(0.1 * Xtr.shape[1])  # keep the 10% highest-scoring features
    feature_selector = SelectKBest(chi2, k=num_feats)
    Xtr = feature_selector.fit_transform(Xtr, ytr)
    print('final shape={}'.format(Xtr.shape))
    #ep1 = feature_selector.transform(ep1)
    ep2 = feature_selector.transform(ep2)

# learn a SVM
print('optimizing a SVM')
svm_base = SVC(kernel='linear', max_iter=-1)  # results were not converging, so SVC is used instead of LinearSVC (kernel='linear' keeps the model linear)
svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
#print("Shape of X:", Xtr.shape)
svm_optimized.fit(Xtr, ytr)
#print('Best params: {}'.format(svm_optimized.best_params_))

# evaluation of results
print('computing the cross-val score')
accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
acc_mean, acc_std = accuracies.mean(), accuracies.std()
print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))

# final test
print('predicting the Epistolas')
#ep1_ = svm_optimized.predict(ep1)
ep2_ = svm_optimized.predict(ep2)
#print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
print('Epistola2 prediction = {}'.format(ep2_))
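For clarity, the evaluation above is a nested cross-validation: cross_val_score re-runs the whole grid search inside each outer fold, so hyperparameters are never tuned on the held-out documents. A minimal self-contained sketch of the same pattern on synthetic data (the random dataset and its shapes are illustrative assumptions, not the real corpus):

import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV

rng = np.random.RandomState(0)
X = rng.rand(20, 12)        # hypothetical: 20 documents x 12 function-word rates
y = rng.randint(0, 2, 20)   # hypothetical binary authorship labels

inner = GridSearchCV(SVC(kernel='linear'),
                     param_grid=[{'C': [0.01, 1, 100]}],
                     cv=2)                       # inner folds pick C
scores = cross_val_score(inner, X, y, cv=2)      # outer folds measure accuracy
print('Accuracy={:.3f} (+-{:.3f})'.format(scores.mean(), scores.std()))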
@@ -0,0 +1,50 @@
import nltk
import numpy as np
import os
from os.path import join

# Latin function words used as style markers
function_words = ["et", "in", "de", "ad", "ut", "cum", "non", "per", "a", "que", "ex", "sed"]

def load_documents(path):
    X, y = [], []
    for file in os.listdir(path):
        if file.startswith('EpistolaXIII_'): continue  # the disputed texts are not training data
        file_clean = file.replace('.txt', '')
        author, textname = file_clean.split('_', 1)
        with open(join(path, file), encoding="utf8") as f:
            tokens = nltk.word_tokenize(f.read())
        author_tokens = [token.lower() for token in tokens
                         if any(char.isalpha() for char in token)]
        freqs = nltk.FreqDist(author_tokens)
        #print(f"From {textname} by {author}:")
        features = []
        for function_word in function_words:
            # rate of the function word per 1000 alphabetic tokens
            feature = (freqs[function_word] * 1000) / len(author_tokens)
            #print(function_word, " = ", freqs[function_word], ", ", feature)
            features.append(feature)
        X.append(features)
        y.append(1 if author == "Dante" else 0)

    # duplicate the small training corpus (the experiment uses 2-fold CV downstream)
    X = np.array(X + X)
    y = np.array(y + y)

    # build the feature vector for the disputed EpistolaXIII_2.txt
    ep = []
    with open(join(path, 'EpistolaXIII_2.txt'), encoding="utf8") as f:
        tokens = nltk.word_tokenize(f.read())
    ep2_tokens = [token.lower() for token in tokens
                  if any(char.isalpha() for char in token)]
    freqs = nltk.FreqDist(ep2_tokens)
    #print("From Epistola XIII_2:")
    for function_word in function_words:
        feature = freqs[function_word] * 1000 / len(ep2_tokens)
        ep.append(feature)
        #print(function_word, " = ", freqs[function_word], ", ", feature)
    ep2 = np.array(ep)

    return X, y, ep2
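For reference, each feature is a relative rate per 1000 alphabetic tokens: if "et" occurs 42 times among 3000 tokens, its value is 42 * 1000 / 3000 = 14.0. A minimal usage sketch of load_documents follows; the Data directory of <Author>_<Title>.txt files is an assumption inferred from the filename parsing above, and the example filenames are hypothetical.

# hypothetical corpus layout: Data/Dante_Monarchia.txt, Data/Boccaccio_Epistola.txt, ...
X, y, ep2 = load_documents('Data')
print(X.shape)    # (2 * n_texts, 12): function-word rates, corpus duplicated
print(y)          # 1 = Dante, 0 = any other author
print(ep2.shape)  # (12,): feature vector for EpistolaXIII_2.txt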