From 634a8a099b58cf37ca2eecc3bc3c80e8dc0b656f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Moreo=20Fern=C3=A1ndez?=
Date: Mon, 15 Oct 2018 14:26:39 +0200
Subject: [PATCH] example code

---
 src/classifier.py         | 55 ++++++++++++++++++++++++++++++++++++
 src/doc_representation.py | 57 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 src/classifier.py
 create mode 100644 src/doc_representation.py

diff --git a/src/classifier.py b/src/classifier.py
new file mode 100644
index 0000000..874c02e
--- /dev/null
+++ b/src/classifier.py
@@ -0,0 +1,55 @@
+"""Dante-vs-rest authorship classifier applied to the two parts of Epistola XIII.
+
+Loads the corpus through ``load_documents``, optionally keeps the 10% of
+features with the highest chi-squared score, tunes a ``LinearSVC`` by grid
+search, reports a cross-validated accuracy and finally prints the predictions
+for both parts of the Epistola.
+"""
+from sklearn.svm import LinearSVC
+from sklearn.model_selection import cross_val_score, GridSearchCV
+from sklearn.feature_selection import SelectKBest, chi2
+from doc_representation import load_documents
+
+nfolds = 2
+do_feat_selection = True
+params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],'class_weight':['balanced',None]}]
+
+path = '/home/moreo/Dante/testi'
+Xtr,ytr,ep1,ep2 = load_documents(path, by_sentences=True)
+
+if do_feat_selection:
+    print('feature selection')
+    # keep at least one feature so SelectKBest never receives k=0 on tiny vocabularies
+    num_feats = max(1, int(0.1 * Xtr.shape[1]))
+    # NOTE(review): the selector is fitted on the whole training set before
+    # cross-validation, so the accuracy printed below is likely optimistic.
+    feature_selector = SelectKBest(chi2, k=num_feats)
+    Xtr = feature_selector.fit_transform(Xtr,ytr)
+    print('final shape={}'.format(Xtr.shape))
+    ep1 = feature_selector.transform(ep1)
+    ep2 = feature_selector.transform(ep2)
+
+
+# learn a SVM
+print('optimizing a SVM')
+svm_base = LinearSVC()
+
+svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
+svm_optimized.fit(Xtr,ytr)
+print('Best params: {}'.format(svm_optimized.best_params_))
+
+# evaluation of results
+print('computng the cross-val score')
+accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
+acc_mean, acc_std = accuracies.mean(), accuracies.std()
+print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))
+
+# final test
+# The Epistola has no gold labels: the mean of the 0/1 predictions is the
+# fraction of its lines attributed to Dante (label 1).
+print('predicting the Epistolas')
+ep1_ = svm_optimized.predict(ep1)
+ep2_ = svm_optimized.predict(ep2)
+print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
+print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))
+
diff --git a/src/doc_representation.py b/src/doc_representation.py
new file mode 100644
index 0000000..9e45ebf
--- /dev/null
+++ b/src/doc_representation.py
@@ -0,0 +1,57 @@
+"""Corpus loading and tf-idf document representation."""
+import os
+from os.path import join
+from sklearn.feature_extraction.text import TfidfVectorizer
+import numpy as np
+
+
+def _read_texts(filepath, by_sentences):
+    """Return the lines of *filepath* (or its whole content as a single item)."""
+    # the context manager closes the handle even if reading fails
+    with open(filepath) as f:
+        return f.readlines() if by_sentences else [f.read()]
+
+
+def load_documents(path, by_sentences=False):
+    """Load the corpus in *path* and represent it as tf-idf vectors.
+
+    Every file in *path* not starting with 'EpistolaXIII_' is a training
+    document; the author is the part of the file name before the first '_'
+    and the label is 1 for 'Dante', 0 otherwise. 'EpistolaXIII_1.txt' and
+    'EpistolaXIII_2.txt' are the texts to be attributed.
+
+    :param path: directory containing the text files
+    :param by_sentences: if True each line of a file is a separate document,
+        otherwise each file is one document
+    :return: tuple (X, y, Epistola1, Epistola2) with the tf-idf matrix of the
+        training documents, their labels as a numpy array, and the tf-idf
+        matrices of the two parts of the Epistola
+    """
+    #read documents
+    docs,y = [],[]
+    for file in os.listdir(path):
+        if file.startswith('EpistolaXIII_'): continue
+        author = file.replace('.txt','').split('_')[0]
+        texts = _read_texts(join(path, file), by_sentences)
+        docs.extend(texts)
+        y.extend([1 if author == 'Dante' else 0] * len(texts))
+
+    if not by_sentences:
+        # whole-document mode duplicates the training set; presumably so that
+        # every cross-validation fold gets enough examples -- TODO confirm
+        y = y + y
+        docs = docs + docs
+
+    ep1 = _read_texts(join(path, 'EpistolaXIII_1.txt'), by_sentences)
+    ep2 = _read_texts(join(path, 'EpistolaXIII_2.txt'), by_sentences)
+
+    # document representation
+    tfidf = TfidfVectorizer(sublinear_tf=True)
+    X = tfidf.fit_transform(docs)
+    y = np.array(y)
+    Epistola1 = tfidf.transform(ep1)
+    Epistola2 = tfidf.transform(ep2)
+
+    print('documents read, shape={}'.format(X.shape))
+
+    return X, y, Epistola1, Epistola2