example code

2018-10-15 14:26:39 +02:00 · 2018-10-15 14:26:39 +02:00 · 634a8a099b
parent e18de4e2dc
commit 634a8a099b
2 changed files with 93 additions and 0 deletions
--- a/src/classifier.py
+++ b/src/classifier.py
@ -0,0 +1,43 @@
 from sklearn.svm import LinearSVC
 from sklearn.model_selection import cross_val_score, GridSearchCV
 from sklearn.feature_selection import SelectKBest, chi2
 from doc_representation import *
 # Script: train a linear SVM that separates texts labelled 1 ('Dante') from the
 # rest (label 0, as produced by load_documents), then apply it to the two
 # held-out EpistolaXIII parts.
 nfolds = 2
 do_feat_selection = True
 # Hyper-parameter grid explored by GridSearchCV for the LinearSVC.
 params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],'class_weight':['balanced',None]}]
 path = '/home/moreo/Dante/testi'
 Xtr,ytr,ep1,ep2 = load_documents(path, by_sentences=True)
 if do_feat_selection:
     print('feature selection')
     # Keep the 10% best features according to chi2, but never fewer than one:
     # with a vocabulary of fewer than 10 terms int(0.1 * n) is 0, which would
     # leave no features to train on.
     num_feats = max(1, int(0.1 * Xtr.shape[1]))
     feature_selector = SelectKBest(chi2, k=num_feats)
     # NOTE(review): the selector is fitted on the whole training set before
     # any cross-validation split, so the CV scores below have seen the labels
     # of their own validation folds and are likely optimistic.
     Xtr = feature_selector.fit_transform(Xtr,ytr)
     print('final shape={}'.format(Xtr.shape))
     # The held-out texts must be projected onto the same selected columns.
     ep1 = feature_selector.transform(ep1)
     ep2 = feature_selector.transform(ep2)
 # learn a SVM
 print('optimizing a SVM')
 svm_base = LinearSVC()
 svm_optimized = GridSearchCV(svm_base, param_grid=params, cv=nfolds)
 svm_optimized.fit(Xtr,ytr)
 print('Best params: {}'.format(svm_optimized.best_params_))
 # evaluation of results
 # The grid search object itself is cross-validated, i.e. the grid search is
 # re-run inside every outer fold (nested cross-validation).
 print('computng the cross-val score')
 accuracies = cross_val_score(svm_optimized, Xtr, ytr, cv=nfolds, n_jobs=-1)
 acc_mean, acc_std = accuracies.mean(), accuracies.std()
 print('Accuracy={:.3f} (+-{:.3f})'.format(acc_mean, acc_std))
 # final test
 print('predicting the Epistolas')
 ep1_ = svm_optimized.predict(ep1)
 ep2_ = svm_optimized.predict(ep2)
 # The printed "acc" is the mean of the predicted labels, i.e. the fraction of
 # rows of each Epistola that the classifier assigns to class 1.
 print('Epistola1 acc = {:.3f} {}'.format(ep1_.mean(), ep1_))
 print('Epistola2 acc = {:.3f} {}'.format(ep2_.mean(), ep2_))
--- a/src/doc_representation.py
+++ b/src/doc_representation.py
@ -0,0 +1,50 @@
 import os
 from os.path import join
 from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
 def _read_text(filepath, as_lines):
     """Return the content of *filepath* and close the file afterwards.
     With ``as_lines`` true the result is the list produced by ``readlines()``
     (line terminators kept); otherwise it is the whole text as one string.
     """
     with open(filepath) as handle:
         return handle.readlines() if as_lines else handle.read()
 def load_documents(path, by_sentences=False):
     """Load the corpus in *path* and return its tf-idf representation.
     Every file in *path* whose name does not start with ``EpistolaXIII_`` is a
     training text. The author is the part of the file name (``.txt`` removed)
     before the first underscore; texts by ``Dante`` get label 1, all others 0.
     ``EpistolaXIII_1.txt`` and ``EpistolaXIII_2.txt`` are read separately and
     only transformed with the vectorizer fitted on the training texts.
     Args:
         path: directory containing the text files.
         by_sentences: if true, every line of a file is one document (with the
             label of its file); if false, every file is one document.
     Returns:
         A tuple ``(X, y, Epistola1, Epistola2)``: the tf-idf matrix of the
         training documents, the numpy array of their 0/1 labels, and the
         tf-idf matrices of the two EpistolaXIII files.
     Raises:
         OSError: if *path* or one of the two EpistolaXIII files cannot be read.
     """
     # read documents
     docs, y = [], []
     # sorted(): os.listdir returns entries in arbitrary order; a fixed order
     # keeps the rows of X (and any cross-validation split of them) reproducible.
     for file in sorted(os.listdir(path)):
         if file.startswith('EpistolaXIII_'):
             continue
         author = file.replace('.txt', '').split('_')[0]
         label = 1 if author == 'Dante' else 0
         if by_sentences:
             lines = _read_text(join(path, file), as_lines=True)
             docs.extend(lines)
             y.extend([label] * len(lines))
         else:
             docs.append(_read_text(join(path, file), as_lines=False))
             y.append(label)
     if not by_sentences:
         # Whole-file mode duplicates every document and label once.
         # NOTE(review): presumably to have enough samples per class for the
         # cross-validation done by the caller -- confirm.
         y = y + y
         docs = docs + docs
     ep_paths = [join(path, 'EpistolaXIII_1.txt'), join(path, 'EpistolaXIII_2.txt')]
     if by_sentences:
         ep1, ep2 = [_read_text(p, as_lines=True) for p in ep_paths]
     else:
         # One single-element list per file: the vectorizer expects an iterable
         # of documents, not a bare string.
         ep1, ep2 = [[_read_text(p, as_lines=False)] for p in ep_paths]
     # document representation
     tfidf = TfidfVectorizer(sublinear_tf=True)
     X = tfidf.fit_transform(docs)
     y = np.array(y)
     Epistola1 = tfidf.transform(ep1)
     Epistola2 = tfidf.transform(ep2)
     print('documents read, shape={}'.format(X.shape))
     return X, y, Epistola1, Epistola2