example code
This commit is contained in:
parent
e18de4e2dc
commit
634a8a099b
|
|
@ -0,0 +1,43 @@
|
||||||
|
from sklearn.svm import LinearSVC
|
||||||
|
from sklearn.model_selection import cross_val_score, GridSearchCV
|
||||||
|
from sklearn.feature_selection import SelectKBest, chi2
|
||||||
|
from doc_representation import *
|
||||||
|
|
||||||
|
# Authorship experiment: fit a linear SVM on the training documents, estimate
# its accuracy by cross-validation, and then classify the two EpistolaXIII parts.

# --- configuration ---
nfolds = 2  # folds used both by the grid search and by the final cross-validation
do_feat_selection = True  # when True, keep only the top 10% of features ranked by chi2
params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight': ['balanced', None]}]

path = '/home/moreo/Dante/testi'
# Xtr/ytr are the training matrix and labels; ep1/ep2 are the two texts to be
# attributed, represented in the same feature space as Xtr.
Xtr, ytr, ep1, ep2 = load_documents(path, by_sentences=True)

if do_feat_selection:
    print('feature selection')
    # Keep at least one feature: with fewer than 10 columns the 10% cut would
    # round down to k=0 and leave the classifier with an empty matrix.
    num_feats = max(1, int(0.1 * Xtr.shape[1]))
    feature_selector = SelectKBest(chi2, k=num_feats)
    # NOTE(review): the selector is fit on the whole training set, so the
    # cross-validation score computed below is measured on features that were
    # chosen using the held-out folds as well. Wrapping selector + SVM in a
    # Pipeline would remove that optimistic bias.
    Xtr = feature_selector.fit_transform(Xtr, ytr)
    print('final shape={}'.format(Xtr.shape))
    # The test texts must go through the same column selection as Xtr.
    ep1 = feature_selector.transform(ep1)
    ep2 = feature_selector.transform(ep2)

# learn an SVM, choosing C and class_weight by grid search
print('optimizing a SVM')
svm_base = LinearSVC()

svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
svm_optimized.fit(Xtr, ytr)
print('Best params: {}'.format(svm_optimized.best_params_))

# evaluation of results: cross-validate the whole grid search
print('computing the cross-val score')
accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
acc_mean, acc_std = accuracies.mean(), accuracies.std()
print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))

# final test
print('predicting the Epistolas')
ep1_ = svm_optimized.predict(ep1)
ep2_ = svm_optimized.predict(ep2)
# NOTE(review): the value labelled "acc" is the mean of the predicted labels,
# not an accuracy against known labels. Presumably labels are 0/1 so this is
# the fraction of rows assigned to class 1 -- confirm against load_documents.
print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))
|
||||||
|
|
||||||
|
|
@ -0,0 +1,50 @@
|
||||||
|
import os
|
||||||
|
from os.path import join
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
def load_documents(path, by_sentences=False):
    """Read the corpus stored in *path* and return its tf-idf representation.

    Every file in *path* whose name does not start with ``EpistolaXIII_`` is
    used as training data. Its label is 1 when the part of the file name that
    precedes the first underscore (after removing ``.txt``) is ``Dante`` and 0
    otherwise. The files ``EpistolaXIII_1.txt`` and ``EpistolaXIII_2.txt`` are
    read separately and projected into the vocabulary learnt from the
    training data.

    Args:
        path: directory containing the text files.
        by_sentences: if True, every line of a file is one sample; if False,
            every file is one sample and the training set is duplicated.

    Returns:
        A tuple ``(X, y, Epistola1, Epistola2)``: the tf-idf matrix of the
        training samples, a numpy array with their 0/1 labels, and the tf-idf
        matrices of the two EpistolaXIII files.
    """

    def read_samples(filename):
        # The with-block closes the handle; the original left every file open.
        with open(join(path, filename)) as fin:
            return fin.readlines() if by_sentences else [fin.read()]

    # read documents
    docs, y = [], []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and hence any unshuffled cross-validation split) reproducible.
    for filename in sorted(os.listdir(path)):
        if filename.startswith('EpistolaXIII_'):
            continue
        # Only the author prefix is needed; file names without an underscore
        # are simply labelled as non-Dante instead of raising IndexError.
        author = filename.replace('.txt', '').split('_')[0]
        label = 1 if author == 'Dante' else 0
        samples = read_samples(filename)
        docs.extend(samples)
        y.extend([label] * len(samples))

    if not by_sentences:
        # Whole-document mode doubles the training set.
        # NOTE(review): presumably done so that small corpora still have
        # enough samples per class for cross-validation -- confirm.
        y = y + y
        docs = docs + docs

    ep1 = read_samples('EpistolaXIII_1.txt')
    ep2 = read_samples('EpistolaXIII_2.txt')

    # document representation: the vocabulary and idf weights are learnt from
    # the training samples only and then applied to the two test texts
    tfidf = TfidfVectorizer(sublinear_tf=True)
    X = tfidf.fit_transform(docs)
    y = np.array(y)
    Epistola1 = tfidf.transform(ep1)
    Epistola2 = tfidf.transform(ep2)

    print('documents read, shape={}'.format(X.shape))

    return X, y, Epistola1, Epistola2
|
||||||
|
|
||||||
|
|
||||||
Loading…
Reference in New Issue