diff --git a/src/__pycache__/model.cpython-36.pyc b/src/__pycache__/model.cpython-36.pyc index c4b8f40..a9ab2bf 100644 Binary files a/src/__pycache__/model.cpython-36.pyc and b/src/__pycache__/model.cpython-36.pyc differ diff --git a/src/author_identification.py b/src/author_identification.py index 3df2892..d058c3f 100644 --- a/src/author_identification.py +++ b/src/author_identification.py @@ -13,7 +13,7 @@ from util.color_visualization import color # TODO: sentence length (Mendenhall-style) ? -for epistola in [2]: +for epistola in [1,2]: if epistola==1: authors = ['Dante','GiovanniBoccaccio','PierDellaVigna'] else: @@ -42,6 +42,7 @@ for epistola in [2]: feature_extractor = FeatureExtractor(function_words_freq='latin', conjugations_freq='latin', features_Mendenhall=True, + features_sentenceLengths=True, tfidf_feat_selection_ratio=0.1, wordngrams=False, n_wordngrams=(1, 2), charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False, diff --git a/src/author_verification.py b/src/author_verification.py index 53218be..159b68c 100644 --- a/src/author_verification.py +++ b/src/author_verification.py @@ -13,7 +13,7 @@ from util.color_visualization import color # TODO: sentence length (Mendenhall-style) ? -for epistola in [1, 2]: +for epistola in [1,2]: print('Epistola {}'.format(epistola)) print('='*80) @@ -27,9 +27,10 @@ for epistola in [1, 2]: feature_extractor = FeatureExtractor(function_words_freq='latin', conjugations_freq='latin', features_Mendenhall=True, + features_sentenceLengths=True, tfidf_feat_selection_ratio=0.1, wordngrams=False, n_wordngrams=(1, 2), - charngrams=True, n_charngrams=(2, 3, 4), preserve_punctuation=False, + charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False, split_documents=True, split_policy=split_by_sentences, window_size=3, normalize_features=True) diff --git a/src/data/__pycache__/dante_loader.cpython-36.pyc b/src/data/__pycache__/dante_loader.cpython-36.pyc new file mode 100644 index 0000000..b67149f Binary files /dev/null and b/src/data/__pycache__/dante_loader.cpython-36.pyc differ diff --git a/src/data/__pycache__/features.cpython-36.pyc b/src/data/__pycache__/features.cpython-36.pyc new file mode 100644 index 0000000..0adc9f3 Binary files /dev/null and b/src/data/__pycache__/features.cpython-36.pyc differ diff --git a/src/data/features.py b/src/data/features.py index 576753f..d30009b 100644 --- a/src/data/features.py +++ b/src/data/features.py @@ -5,40 +5,39 @@ from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2 from sklearn.preprocessing import normalize from scipy.sparse import hstack, csr_matrix, issparse -import collections from nltk.corpus import stopwords -latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'vt', 'cvm', 'per', 'a', 'sed', 'qve', 'qvia', 'ex', 'sic', - 'si', 'etiam', 'idest', 'nam', 'vnde', 'ab', 'vel', 'sicvt', 'ita', 'enim', 'scilicet', 'nec', - 'pro', 'avtem', 'ibi', 'dvm', 'vero', 'tamen', 'inter', 'ideo', 'propter', 'contra', 'svb', - 'qvomodo', 'vbi', 'svper', 'iam', 'tam', 'hec', 'post', 'qvasi', 'ergo', 'inde', 'e', 'tvnc', - 'atqve', 'ac', 'sine', 'nisi', 'nvnc', 'qvando', 'ne', 'vsqve', 'sive', 'avt', 'igitvr', 'circa', - 'qvidem', 'svpra', 'ante', 'adhvc', 'sev' , 'apvd', 'olim', 'statim', 'satis', 'ob', 'qvoniam', - 'postea', 'nvnqvam'] +latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'ut', 'cum', 'per', 'a', 'sed', 'que', 'quia', 'ex', 'sic', + 'si', 'etiam', 'idest', 'nam', 'unde', 'ab', 'uel', 'sicut', 'ita', 'enim', 'scilicet', 'nec', + 'pro', 'autem', 'ibi', 'dum', 'uero', 'tamen', 'inter', 'ideo', 'propter', 'contra', 'sub', + 'quomodo', 'ubi', 'super', 'iam', 'tam', 'hec', 'post', 'quasi', 'ergo', 'inde', 'e', 'tunc', + 'atque', 'ac', 'sine', 'nisi', 'nunc', 'quando', 'ne', 'usque', 'siue', 'aut', 'igitur', 'circa', + 'quidem', 'supra', 'ante', 'adhuc', 'seu' , 'apud', 'olim', 'statim', 'satis', 'ob', 'quoniam', + 'postea', 'nunquam'] -latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amvs', 'emvs', 'imvs', 'atis', 'etis', - 'itis', 'ant', 'ent', 'vnt', 'ivnt', 'or', 'eor', 'ior', 'aris', 'eris', 'iris', 'atvr', 'etvr', - 'itvr', 'amvr', 'emvr', 'imvr', 'amini', 'emini', 'imini', 'antvr', 'entvr', 'vntvr', 'ivntvr', - 'abam', 'ebam', 'iebam', 'abas', 'ebas', 'iebas', 'abat', 'ebat', 'iebat', 'abamvs', 'ebamvs', - 'iebamvs', 'abatis', 'ebatis', 'iebatis', 'abant', 'ebant', 'iebant', 'abar', 'ebar', 'iebar', - 'abaris', 'ebaris', 'iebaris', 'abatvr', 'ebatvr', 'iebatvr', 'abamvr', 'ebamvr', 'iebamvr', - 'abamini', 'ebamini', 'iebamini', 'abantvr', 'ebantvr', 'iebantvr', 'abo', 'ebo', 'am', 'iam', - 'abis', 'ebis', 'ies', 'abit', 'ebit', 'iet', 'abimvs', 'ebimvs', 'emvs', 'iemvs', 'abitis', - 'ebitis', 'ietis', 'abvnt', 'ebvnt', 'ient', 'abor', 'ebor', 'ar', 'iar', 'aberis', 'eberis', - 'ieris', 'abitvr', 'ebitvr', 'ietvr', 'abimvr', 'ebimvr', 'iemvr', 'abimini', 'ebimini', 'iemini', - 'abvntvr', 'ebvntvr', 'ientvr', 'i', 'isti', 'it', 'imvs', 'istis', 'ervnt', 'em', 'eam', 'eas', - 'ias', 'eat', 'iat', 'eamvs', 'iamvs', 'eatis', 'iatis', 'eant', 'iant', 'er', 'ear', 'earis', - 'iaris', 'eatvr', 'iatvr', 'eamvr', 'iamvr', 'eamini', 'iamini', 'eantvr', 'iantvr', 'rem', 'res', - 'ret', 'remvs', 'retis', 'rent', 'rer', 'reris', 'retvr', 'remvr', 'remini', 'rentvr', 'erim', - 'issem', 'isses', 'isset', 'issemvs', 'issetis', 'issent', 'a', 'ate', 'e', 'ete', 'ite', 'are', - 'ere', 'ire', 'ato', 'eto', 'ito', 'atote', 'etote', 'itote', 'anto', 'ento', 'vnto', 'ivnto', - 'ator', 'etor', 'itor', 'aminor', 'eminor', 'iminor', 'antor', 'entor', 'vntor', 'ivntor', 'ari', - 'eri', 'iri', 'andi', 'ando', 'andvm', 'andvs', 'ande', 'ans', 'antis', 'anti', 'antem', 'antes', - 'antivm', 'antibvs', 'antia', 'esse', 'svm', 'es', 'est', 'svmvs', 'estis', 'svnt', 'eram', 'eras', - 'erat', 'eramvs', 'eratis', 'erant', 'ero', 'eris', 'erit', 'erimvs', 'eritis', 'erint', 'sim', - 'sis', 'sit', 'simvs', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemvs', 'essetis', 'essent', - 'fvi', 'fvisti', 'fvit', 'fvimvs', 'fvistis', 'fvervnt', 'este', 'esto', 'estote', 'svnto'] +latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amus', 'emus', 'imus', 'atis', 'etis', + 'itis', 'ant', 'ent', 'unt', 'iunt', 'or', 'eor', 'ior', 'aris', 'eris', 'iris', 'atur', 'etur', + 'itur', 'amur', 'emur', 'imur', 'amini', 'emini', 'imini', 'antur', 'entur', 'untur', 'iuntur', + 'abam', 'ebam', 'iebam', 'abas', 'ebas', 'iebas', 'abat', 'ebat', 'iebat', 'abamus', 'ebamus', + 'iebamus', 'abatis', 'ebatis', 'iebatis', 'abant', 'ebant', 'iebant', 'abar', 'ebar', 'iebar', + 'abaris', 'ebaris', 'iebaris', 'abatur', 'ebatur', 'iebatur', 'abamur', 'ebamur', 'iebamur', + 'abamini', 'ebamini', 'iebamini', 'abantur', 'ebantur', 'iebantur', 'abo', 'ebo', 'am', 'iam', + 'abis', 'ebis', 'ies', 'abit', 'ebit', 'iet', 'abimus', 'ebimus', 'emus', 'iemus', 'abitis', + 'ebitis', 'ietis', 'abunt', 'ebunt', 'ient', 'abor', 'ebor', 'ar', 'iar', 'aberis', 'eberis', + 'ieris', 'abitur', 'ebitur', 'ietur', 'abimur', 'ebimur', 'iemur', 'abimini', 'ebimini', 'iemini', + 'abuntur', 'ebuntur', 'ientur', 'i', 'isti', 'it', 'imus', 'istis', 'erunt', 'em', 'eam', 'eas', + 'ias', 'eat', 'iat', 'eamus', 'iamus', 'eatis', 'iatis', 'eant', 'iant', 'er', 'ear', 'earis', + 'iaris', 'eatur', 'iatur', 'eamur', 'iamur', 'eamini', 'iamini', 'eantur', 'iantur', 'rem', 'res', + 'ret', 'remus', 'retis', 'rent', 'rer', 'reris', 'retur', 'remur', 'remini', 'rentur', 'erim', + 'issem', 'isses', 'isset', 'issemus', 'issetis', 'issent', 'a', 'ate', 'e', 'ete', 'ite', 'are', + 'ere', 'ire', 'ato', 'eto', 'ito', 'atote', 'etote', 'itote', 'anto', 'ento', 'unto', 'iunto', + 'ator', 'etor', 'itor', 'aminor', 'eminor', 'iminor', 'antor', 'entor', 'untor', 'iuntor', 'ari', + 'eri', 'iri', 'andi', 'ando', 'andum', 'andus', 'ande', 'ans', 'antis', 'anti', 'antem', 'antes', + 'antium', 'antibus', 'antia', 'esse', 'sum', 'es', 'est', 'sumus', 'estis', 'sunt', 'eram', 'eras', + 'erat', 'eramus', 'eratis', 'erant', 'ero', 'eris', 'erit', 'erimus', 'eritis', 'erint', 'sim', + 'sis', 'sit', 'simus', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemus', 'essetis', 'essent', + 'fui', 'fuisti', 'fuit', 'fuimus', 'fuistis', 'fuerunt', 'este', 'esto', 'estote', 'sunto'] spanish_conjugations = ['o','as','a','amos','áis','an','es','e','emos','éis','en','imos','ís','guir','ger','gir', 'ar', 'er', 'ir', 'é', 'aste', 'ó','asteis','aron','í','iste','ió','isteis','ieron', @@ -167,22 +166,45 @@ def _features_Mendenhall(documents, upto=23): :param documents: a list where each element is the text (string) of a document :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered) """ - features = [] - for text in documents: unmod_tokens = nltk.word_tokenize(text) mod_tokens = ([token.lower() for token in unmod_tokens if any(char.isalpha() for char in token)]) nwords = len(mod_tokens) - tokens_len = [len(token) for token in mod_tokens] - - count = collections.Counter(tokens_len) - features.append([1000.*count[i]/nwords for i in range(1,upto)]) - + tokens_count = [] + for i in range(1, upto): + tokens_count.append(1000.*(sum(j>= i for j in tokens_len))/nwords) + features.append(tokens_count) return np.array(features) +def _features_sentenceLengths(documents, downto=3, upto=70): + """ + Extract features as the length of the sentences, ie. number of words in the sentence. + :param documents: a list where each element is the text (string) of a document + :param downto: minimal length considered + :param upto: maximum length considered + :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered) + """ + features = [] + for text in documents: + sentences = [t.strip() for t in nltk.tokenize.sent_tokenize(text) if t.strip()] + nsent = len(sentences) + sent_len = [] + sent_count = [] + for sentence in sentences: + unmod_tokens = nltk.tokenize.word_tokenize(sentence) + mod_tokens = ([token for token in unmod_tokens if any(char.isalpha() for char in token)]) + sent_len.append(len(mod_tokens)) + for i in range(downto, upto): + sent_count.append(1000.*(sum(j>= i for j in sent_len))/nsent) + features.append(sent_count) + return np.array(features) + + + + def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)): """ Extract features as tfidf matrix extracted from the documents @@ -238,6 +260,7 @@ class FeatureExtractor: function_words_freq=None, conjugations_freq=None, features_Mendenhall=True, + features_sentenceLengths=True, wordngrams=False, tfidf_feat_selection_ratio=1., n_wordngrams=(1, 1), @@ -271,6 +294,7 @@ class FeatureExtractor: self.function_words_freq = function_words_freq self.conjugations_freq = conjugations_freq self.features_Mendenhall = features_Mendenhall + self.features_sentenceLengths = features_sentenceLengths self.tfidf = wordngrams self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio self.wordngrams = n_wordngrams @@ -319,6 +343,10 @@ class FeatureExtractor: X = self._addfeatures(X, _features_Mendenhall(documents)) self._print('adding Mendenhall words features: {} features'.format(X.shape[1])) + if self.features_sentenceLengths: + X = self._addfeatures(X, _features_sentenceLengths(documents)) + self._print('adding sentence lengths features: {} features'.format(X.shape[1])) + # sparse feature extraction functions if self.tfidf: X_features, vectorizer = _features_tfidf(documents, ngrams=self.wordngrams) @@ -384,6 +412,10 @@ class FeatureExtractor: TEST = self._addfeatures(TEST, _features_Mendenhall(test)) self._print('adding Mendenhall words features: {} features'.format(TEST.shape[1])) + if self.features_sentenceLengths: + TEST = self._addfeatures(TEST, _features_sentenceLengths(test)) + self._print('adding sentence lengths features: {} features'.format(TEST.shape[1])) + # sparse feature extraction functions if self.tfidf: ep1_features, _ = _features_tfidf(test, self.tfidf_vectorizer)