Features: sentence lengths

Silvia Corbara 2019-01-23 23:39:58 +01:00
parent 1387ef2c59
commit 695dcff635
6 changed files with 74 additions and 40 deletions

Binary file not shown.


@@ -13,7 +13,7 @@ from util.color_visualization import color
 # TODO: sentence length (Mendenhall-style) ?
-for epistola in [2]:
+for epistola in [1,2]:
     if epistola==1:
         authors = ['Dante','GiovanniBoccaccio','PierDellaVigna']
     else:
@@ -42,6 +42,7 @@ for epistola in [2]:
     feature_extractor = FeatureExtractor(function_words_freq='latin',
                                          conjugations_freq='latin',
                                          features_Mendenhall=True,
+                                         features_sentenceLengths=True,
                                          tfidf_feat_selection_ratio=0.1,
                                          wordngrams=False, n_wordngrams=(1, 2),
                                          charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False,


@@ -13,7 +13,7 @@ from util.color_visualization import color
 # TODO: sentence length (Mendenhall-style) ?
-for epistola in [1, 2]:
+for epistola in [1,2]:
     print('Epistola {}'.format(epistola))
     print('='*80)
@@ -27,9 +27,10 @@ for epistola in [1, 2]:
     feature_extractor = FeatureExtractor(function_words_freq='latin',
                                          conjugations_freq='latin',
                                          features_Mendenhall=True,
+                                         features_sentenceLengths=True,
                                          tfidf_feat_selection_ratio=0.1,
                                          wordngrams=False, n_wordngrams=(1, 2),
-                                         charngrams=True, n_charngrams=(2, 3, 4), preserve_punctuation=False,
+                                         charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False,
                                          split_documents=True, split_policy=split_by_sentences, window_size=3,
                                          normalize_features=True)

Binary file not shown.

Binary file not shown.


@@ -5,40 +5,39 @@ from sklearn.feature_selection import SelectKBest
 from sklearn.feature_selection import chi2
 from sklearn.preprocessing import normalize
 from scipy.sparse import hstack, csr_matrix, issparse
-import collections
 from nltk.corpus import stopwords

-latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'vt', 'cvm', 'per', 'a', 'sed', 'qve', 'qvia', 'ex', 'sic',
-                        'si', 'etiam', 'idest', 'nam', 'vnde', 'ab', 'vel', 'sicvt', 'ita', 'enim', 'scilicet', 'nec',
-                        'pro', 'avtem', 'ibi', 'dvm', 'vero', 'tamen', 'inter', 'ideo', 'propter', 'contra', 'svb',
-                        'qvomodo', 'vbi', 'svper', 'iam', 'tam', 'hec', 'post', 'qvasi', 'ergo', 'inde', 'e', 'tvnc',
-                        'atqve', 'ac', 'sine', 'nisi', 'nvnc', 'qvando', 'ne', 'vsqve', 'sive', 'avt', 'igitvr', 'circa',
-                        'qvidem', 'svpra', 'ante', 'adhvc', 'sev', 'apvd', 'olim', 'statim', 'satis', 'ob', 'qvoniam',
-                        'postea', 'nvnqvam']
+latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'ut', 'cum', 'per', 'a', 'sed', 'que', 'quia', 'ex', 'sic',
+                        'si', 'etiam', 'idest', 'nam', 'unde', 'ab', 'uel', 'sicut', 'ita', 'enim', 'scilicet', 'nec',
+                        'pro', 'autem', 'ibi', 'dum', 'uero', 'tamen', 'inter', 'ideo', 'propter', 'contra', 'sub',
+                        'quomodo', 'ubi', 'super', 'iam', 'tam', 'hec', 'post', 'quasi', 'ergo', 'inde', 'e', 'tunc',
+                        'atque', 'ac', 'sine', 'nisi', 'nunc', 'quando', 'ne', 'usque', 'siue', 'aut', 'igitur', 'circa',
+                        'quidem', 'supra', 'ante', 'adhuc', 'seu', 'apud', 'olim', 'statim', 'satis', 'ob', 'quoniam',
+                        'postea', 'nunquam']

-latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amvs', 'emvs', 'imvs', 'atis', 'etis',
-                      'itis', 'ant', 'ent', 'vnt', 'ivnt', 'or', 'eor', 'ior', 'aris', 'eris', 'iris', 'atvr', 'etvr',
-                      'itvr', 'amvr', 'emvr', 'imvr', 'amini', 'emini', 'imini', 'antvr', 'entvr', 'vntvr', 'ivntvr',
-                      'abam', 'ebam', 'iebam', 'abas', 'ebas', 'iebas', 'abat', 'ebat', 'iebat', 'abamvs', 'ebamvs',
-                      'iebamvs', 'abatis', 'ebatis', 'iebatis', 'abant', 'ebant', 'iebant', 'abar', 'ebar', 'iebar',
-                      'abaris', 'ebaris', 'iebaris', 'abatvr', 'ebatvr', 'iebatvr', 'abamvr', 'ebamvr', 'iebamvr',
-                      'abamini', 'ebamini', 'iebamini', 'abantvr', 'ebantvr', 'iebantvr', 'abo', 'ebo', 'am', 'iam',
-                      'abis', 'ebis', 'ies', 'abit', 'ebit', 'iet', 'abimvs', 'ebimvs', 'emvs', 'iemvs', 'abitis',
-                      'ebitis', 'ietis', 'abvnt', 'ebvnt', 'ient', 'abor', 'ebor', 'ar', 'iar', 'aberis', 'eberis',
-                      'ieris', 'abitvr', 'ebitvr', 'ietvr', 'abimvr', 'ebimvr', 'iemvr', 'abimini', 'ebimini', 'iemini',
-                      'abvntvr', 'ebvntvr', 'ientvr', 'i', 'isti', 'it', 'imvs', 'istis', 'ervnt', 'em', 'eam', 'eas',
-                      'ias', 'eat', 'iat', 'eamvs', 'iamvs', 'eatis', 'iatis', 'eant', 'iant', 'er', 'ear', 'earis',
-                      'iaris', 'eatvr', 'iatvr', 'eamvr', 'iamvr', 'eamini', 'iamini', 'eantvr', 'iantvr', 'rem', 'res',
-                      'ret', 'remvs', 'retis', 'rent', 'rer', 'reris', 'retvr', 'remvr', 'remini', 'rentvr', 'erim',
-                      'issem', 'isses', 'isset', 'issemvs', 'issetis', 'issent', 'a', 'ate', 'e', 'ete', 'ite', 'are',
-                      'ere', 'ire', 'ato', 'eto', 'ito', 'atote', 'etote', 'itote', 'anto', 'ento', 'vnto', 'ivnto',
-                      'ator', 'etor', 'itor', 'aminor', 'eminor', 'iminor', 'antor', 'entor', 'vntor', 'ivntor', 'ari',
-                      'eri', 'iri', 'andi', 'ando', 'andvm', 'andvs', 'ande', 'ans', 'antis', 'anti', 'antem', 'antes',
-                      'antivm', 'antibvs', 'antia', 'esse', 'svm', 'es', 'est', 'svmvs', 'estis', 'svnt', 'eram', 'eras',
-                      'erat', 'eramvs', 'eratis', 'erant', 'ero', 'eris', 'erit', 'erimvs', 'eritis', 'erint', 'sim',
-                      'sis', 'sit', 'simvs', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemvs', 'essetis', 'essent',
-                      'fvi', 'fvisti', 'fvit', 'fvimvs', 'fvistis', 'fvervnt', 'este', 'esto', 'estote', 'svnto']
+latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amus', 'emus', 'imus', 'atis', 'etis',
+                      'itis', 'ant', 'ent', 'unt', 'iunt', 'or', 'eor', 'ior', 'aris', 'eris', 'iris', 'atur', 'etur',
+                      'itur', 'amur', 'emur', 'imur', 'amini', 'emini', 'imini', 'antur', 'entur', 'untur', 'iuntur',
+                      'abam', 'ebam', 'iebam', 'abas', 'ebas', 'iebas', 'abat', 'ebat', 'iebat', 'abamus', 'ebamus',
+                      'iebamus', 'abatis', 'ebatis', 'iebatis', 'abant', 'ebant', 'iebant', 'abar', 'ebar', 'iebar',
+                      'abaris', 'ebaris', 'iebaris', 'abatur', 'ebatur', 'iebatur', 'abamur', 'ebamur', 'iebamur',
+                      'abamini', 'ebamini', 'iebamini', 'abantur', 'ebantur', 'iebantur', 'abo', 'ebo', 'am', 'iam',
+                      'abis', 'ebis', 'ies', 'abit', 'ebit', 'iet', 'abimus', 'ebimus', 'emus', 'iemus', 'abitis',
+                      'ebitis', 'ietis', 'abunt', 'ebunt', 'ient', 'abor', 'ebor', 'ar', 'iar', 'aberis', 'eberis',
+                      'ieris', 'abitur', 'ebitur', 'ietur', 'abimur', 'ebimur', 'iemur', 'abimini', 'ebimini', 'iemini',
+                      'abuntur', 'ebuntur', 'ientur', 'i', 'isti', 'it', 'imus', 'istis', 'erunt', 'em', 'eam', 'eas',
+                      'ias', 'eat', 'iat', 'eamus', 'iamus', 'eatis', 'iatis', 'eant', 'iant', 'er', 'ear', 'earis',
+                      'iaris', 'eatur', 'iatur', 'eamur', 'iamur', 'eamini', 'iamini', 'eantur', 'iantur', 'rem', 'res',
+                      'ret', 'remus', 'retis', 'rent', 'rer', 'reris', 'retur', 'remur', 'remini', 'rentur', 'erim',
+                      'issem', 'isses', 'isset', 'issemus', 'issetis', 'issent', 'a', 'ate', 'e', 'ete', 'ite', 'are',
+                      'ere', 'ire', 'ato', 'eto', 'ito', 'atote', 'etote', 'itote', 'anto', 'ento', 'unto', 'iunto',
+                      'ator', 'etor', 'itor', 'aminor', 'eminor', 'iminor', 'antor', 'entor', 'untor', 'iuntor', 'ari',
+                      'eri', 'iri', 'andi', 'ando', 'andum', 'andus', 'ande', 'ans', 'antis', 'anti', 'antem', 'antes',
+                      'antium', 'antibus', 'antia', 'esse', 'sum', 'es', 'est', 'sumus', 'estis', 'sunt', 'eram', 'eras',
+                      'erat', 'eramus', 'eratis', 'erant', 'ero', 'eris', 'erit', 'erimus', 'eritis', 'erint', 'sim',
+                      'sis', 'sit', 'simus', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemus', 'essetis', 'essent',
+                      'fui', 'fuisti', 'fuit', 'fuimus', 'fuistis', 'fuerunt', 'este', 'esto', 'estote', 'sunto']

 spanish_conjugations = ['o','as','a','amos','áis','an','es','e','emos','éis','en','imos','ís','guir','ger','gir',
                         'ar', 'er', 'ir', 'é', 'aste', 'ó','asteis','aron','í','iste','','isteis','ieron',
@@ -167,22 +166,45 @@ def _features_Mendenhall(documents, upto=23):
     :param documents: a list where each element is the text (string) of a document
     :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
     """
     features = []
     for text in documents:
         unmod_tokens = nltk.word_tokenize(text)
         mod_tokens = ([token.lower() for token in unmod_tokens if any(char.isalpha() for char in token)])
         nwords = len(mod_tokens)
         tokens_len = [len(token) for token in mod_tokens]
-        count = collections.Counter(tokens_len)
-        features.append([1000.*count[i]/nwords for i in range(1,upto)])
+        tokens_count = []
+        for i in range(1, upto):
+            tokens_count.append(1000.*(sum(j >= i for j in tokens_len))/nwords)
+        features.append(tokens_count)
     return np.array(features)


+def _features_sentenceLengths(documents, downto=3, upto=70):
+    """
+    Extract features as the length of the sentences, i.e. the number of words in each sentence.
+    :param documents: a list where each element is the text (string) of a document
+    :param downto: minimal length considered
+    :param upto: maximum length considered
+    :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered)
+    """
+    features = []
+    for text in documents:
+        sentences = [t.strip() for t in nltk.tokenize.sent_tokenize(text) if t.strip()]
+        nsent = len(sentences)
+        sent_len = []
+        sent_count = []
+        for sentence in sentences:
+            unmod_tokens = nltk.tokenize.word_tokenize(sentence)
+            mod_tokens = ([token for token in unmod_tokens if any(char.isalpha() for char in token)])
+            sent_len.append(len(mod_tokens))
+        for i in range(downto, upto):
+            sent_count.append(1000.*(sum(j >= i for j in sent_len))/nsent)
+        features.append(sent_count)
+    return np.array(features)
+
+
 def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)):
     """
     Extract features as tfidf matrix extracted from the documents
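
For reference, a minimal standalone sketch (not part of this commit) of the cumulative statistic that _features_sentenceLengths computes: for each length threshold i in [downto, upto), the number of sentences containing at least i words, scaled to a rate per 1000 sentences. The revised _features_Mendenhall applies the same cumulative idea to word lengths. The helper name cumulative_length_profile and the toy lengths below are made up for illustration only.

def cumulative_length_profile(sentence_lengths, downto=3, upto=70):
    # for each threshold i, the per-1000 rate of sentences with at least i words
    nsent = len(sentence_lengths)
    return [1000. * sum(l >= i for l in sentence_lengths) / nsent
            for i in range(downto, upto)]

# three toy sentences of 5, 12 and 30 words
print(cumulative_length_profile([5, 12, 30], downto=3, upto=8))
# -> [1000.0, 1000.0, 1000.0, 666.66..., 666.66...]
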
@@ -238,6 +260,7 @@ class FeatureExtractor:
                  function_words_freq=None,
                  conjugations_freq=None,
                  features_Mendenhall=True,
+                 features_sentenceLengths=True,
                  wordngrams=False,
                  tfidf_feat_selection_ratio=1.,
                  n_wordngrams=(1, 1),
@@ -271,6 +294,7 @@
         self.function_words_freq = function_words_freq
         self.conjugations_freq = conjugations_freq
         self.features_Mendenhall = features_Mendenhall
+        self.features_sentenceLengths = features_sentenceLengths
         self.tfidf = wordngrams
         self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio
         self.wordngrams = n_wordngrams
@@ -319,6 +343,10 @@ class FeatureExtractor:
             X = self._addfeatures(X, _features_Mendenhall(documents))
             self._print('adding Mendenhall words features: {} features'.format(X.shape[1]))

+        if self.features_sentenceLengths:
+            X = self._addfeatures(X, _features_sentenceLengths(documents))
+            self._print('adding sentence lengths features: {} features'.format(X.shape[1]))
+
         # sparse feature extraction functions
         if self.tfidf:
             X_features, vectorizer = _features_tfidf(documents, ngrams=self.wordngrams)
@@ -384,6 +412,10 @@ class FeatureExtractor:
             TEST = self._addfeatures(TEST, _features_Mendenhall(test))
             self._print('adding Mendenhall words features: {} features'.format(TEST.shape[1]))

+        if self.features_sentenceLengths:
+            TEST = self._addfeatures(TEST, _features_sentenceLengths(test))
+            self._print('adding sentence lengths features: {} features'.format(TEST.shape[1]))
+
         # sparse feature extraction functions
         if self.tfidf:
             ep1_features, _ = _features_tfidf(test, self.tfidf_vectorizer)
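
The new dense sentence-length block is appended to the running feature matrix through the same _addfeatures helper already used for the Mendenhall features. Its implementation is not shown in this diff; a plausible sketch of such a helper, assuming only the hstack/csr_matrix/issparse imports already present in the file (the name add_dense_block is a hypothetical stand-in, not the repository's code):

import numpy as np
from scipy.sparse import hstack, csr_matrix, issparse

def add_dense_block(X, F):
    # append a dense (D x f) block F to the right of an existing (D x n) matrix X,
    # keeping the result sparse when X already is
    F = csr_matrix(np.asarray(F))
    if X is None:
        return F
    if not issparse(X):
        X = csr_matrix(X)
    return hstack([X, F]).tocsr()
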