diff --git a/src/author_attribution.py b/src/author_attribution.py
old mode 100644
new mode 100755
index 6de0ba9..de06fab
--- a/src/author_attribution.py
+++ b/src/author_attribution.py
@@ -1,3 +1,4 @@
+from mpl_toolkits.axes_grid1 import make_axes_locatable
 from sklearn.linear_model import LogisticRegression
 from data.dante_loader import load_texts
 from data.features import *
@@ -6,60 +7,104 @@ import numpy as np
 import matplotlib
 import matplotlib.pyplot as plt
 
-def plot_attribution(path, authors, attributions, paragraph_offset=1):
+def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5, 5), label_offset=0.3):
 
-    paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[1]-1)]
+    # transpose: rows become paragraphs, columns the candidate authors
+    attributions = attributions.T
+    print(attributions.shape)
+    # attributions = attributions > 0.5
+    paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[0]-1)]
 
-    fig, ax = plt.subplots()
-    im = ax.imshow(attributions)
+    fig, ax = plt.subplots(figsize=figsize)
+
+    # im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greens')
+    im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greys')
+
+    # Create colorbar
+    # cbar = fig.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1, pad=0.04)
+    # ax.figure.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1)
+    # ax.figure.colorbar(im, ax=ax, orientation="horizontal", pad=0.05)
+    # cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")
 
     # We want to show all ticks...
-    ax.set_xticks(np.arange(len(paragraphs)))
-    ax.set_yticks(np.arange(len(authors)))
+    # ax.set_xticks(np.arange(len(authors)))
+    ax.set_xticks(np.arange(len(authors)) + label_offset)
+    ax.set_yticks(np.arange(len(paragraphs)))
     # ... and label them with the respective list entries
-    ax.set_xticklabels(paragraphs)
-    ax.set_yticklabels(authors)
+    ax.set_xticklabels(authors)
+    ax.set_yticklabels(paragraphs)
+
+    # author labels go on top of the matrix, not below it
+    ax.tick_params(top=False, bottom=False,
+                   labeltop=True, labelbottom=False)
 
     # Rotate the tick labels and set their alignment.
-    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
-             rotation_mode="anchor")
+    plt.setp(ax.get_xticklabels(), rotation=90, ha="left", rotation_mode="anchor")
+
+    # hide the frame and draw a minor-tick grid between cells
+    for spine in ax.spines.values():
+        spine.set_visible(False)
+
+    ax.set_xticks(np.arange(len(authors)+1) - .5, minor=True)
+    ax.set_yticks(np.arange(len(paragraphs)+1) - .5, minor=True)
+
+    ax.grid(which="minor", color="k", linestyle='-', linewidth=1)
+    ax.tick_params(which="minor", bottom=False, left=False)
 
     # Loop over data dimensions and create text annotations.
-    for i in range(len(authors)):
-        for j in range(len(paragraphs)):
-            text = ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
+    # for i in range(len(authors)):
+    #     for j in range(len(paragraphs)):
+    #         text = ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
 
-    ax.set_title("Attribution matrix")
+    # ax.set_title("Attribution matrix")
     fig.tight_layout()
     # plt.show()
     plt.savefig(path)
 
 import sys
-authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
-attributions = np.load('attribution_ep1.npy')
-plot_attribution('plot1.pdf', authors, attributions)
+for epistola in [1]:
+    if epistola == 1:
+        authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
+        paragraph_offset = 1
+        figsize = (3, 9)
+        label_offset = 0.2
+    else:
+        authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
+                   'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
+                   'GrazioloBambaglioli', 'GuidoDaPisa',
+                   'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
+                   'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
+                   'PietroAlighieri', 'RaimundusLullus',
+                   'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
+        paragraph_offset = 14
+        figsize = (6, 20)
+        label_offset = 0.3
+
+    attributions = np.load(f'attribution_ep{epistola}.npy')
+    plot_attribution(f'plot{epistola}.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
 sys.exit(0)
 
-author_attribution = []
 for epistola in [1]:
+    author_attribution = []
     print(f'Epistola {epistola}')
     print('='*80)
     path = f'../testi_{epistola}'
     if epistola == 1:
         authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
-        paragraphs = range(1, 3)
+        paragraphs = range(1, 14)
     else:
-        authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna', 'ClaraAssisiensis',
+        authors = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
                    'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
                    'GrazioloBambaglioli', 'GuidoDaPisa',
                    'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
-                   'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet', 'PierDellaVigna',
+                   'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
                    'PietroAlighieri', 'RaimundusLullus',
                    'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
-        paragraphs = range(13, 90)
+        paragraphs = range(14, 91)
+        assert len(authors) == 20, f'unexpected number of authors ({len(authors)})'
+        path += '_tutti'
 
     discarded = 0
     f1_scores = []
@@ -72,9 +117,9 @@ for epistola in [1]:
         target = [f'EpistolaXIII_{epistola}.txt'] + [f'EpistolaXIII_{epistola}_{paragraph}.txt' for paragraph in paragraphs]
         positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target)
 
-        if len(positive) < 2:
-            discarded += 1
-            continue
+        # if len(positive) < 2:
+        #     discarded += 1
+        #     continue
 
         n_full_docs = len(positive) + len(negative)
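
Note on usage: plot_attribution now expects the attribution matrix as authors x targets (one row per candidate author; column 0 is the full letter, the remaining columns its paragraphs) and transposes it internally so paragraphs run down the rows. A minimal sketch of a standalone call, with a random matrix standing in for the real attribution_ep1.npy (values illustrative only; it assumes plot_attribution has been copied into scope, since importing the script as-is would execute its top-level plotting code):

    import numpy as np

    authors = ['Dante', 'ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
    rng = np.random.default_rng(0)
    # one row per author; column 0 = full letter, columns 1..13 = paragraphs 1-13
    attributions = rng.random((len(authors), 14))
    plot_attribution('demo.png', authors, attributions,
                     paragraph_offset=1, figsize=(3, 9), label_offset=0.2)
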
diff --git a/src/author_attribution_XIV.py b/src/author_attribution_XIV.py
new file mode 100755
index 0000000..ec1fc19
--- /dev/null
+++ b/src/author_attribution_XIV.py
@@ -0,0 +1,152 @@
+from mpl_toolkits.axes_grid1 import make_axes_locatable
+from sklearn.linear_model import LogisticRegression
+from data.dante_loader import load_texts
+from data.features import *
+from model import AuthorshipVerificator, f1_from_counters
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+
+def plot_attribution(path, authors, attributions, paragraph_offset=1, figsize=(5, 5), label_offset=0.3):
+
+    # transpose: rows become paragraphs, columns the candidate authors
+    attributions = attributions.T
+    print(attributions.shape)
+    # attributions = attributions > 0.5
+    paragraphs = ["Full"] + [f'{paragraph_offset+i}' for i in range(attributions.shape[0]-1)]
+
+    fig, ax = plt.subplots(figsize=figsize)
+
+    # im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greens')
+    im = ax.imshow(attributions, vmin=0, vmax=1, cmap='Greys')
+
+    # Create colorbar
+    # cbar = fig.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1, pad=0.04)
+    # ax.figure.colorbar(im, ax=ax, orientation="horizontal", fraction=0.1)
+    # ax.figure.colorbar(im, ax=ax, orientation="horizontal", pad=0.05)
+    # cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")
+
+    # We want to show all ticks...
+    # ax.set_xticks(np.arange(len(authors)))
+    ax.set_xticks(np.arange(len(authors)) + label_offset)
+    ax.set_yticks(np.arange(len(paragraphs)))
+    # ... and label them with the respective list entries
+    ax.set_xticklabels(authors)
+    ax.set_yticklabels(paragraphs)
+
+    # author labels go on top of the matrix, not below it
+    ax.tick_params(top=False, bottom=False,
+                   labeltop=True, labelbottom=False)
+
+    # Rotate the tick labels and set their alignment.
+    plt.setp(ax.get_xticklabels(), rotation=90, ha="left", rotation_mode="anchor")
+
+    # hide the frame and draw a minor-tick grid between cells
+    for spine in ax.spines.values():
+        spine.set_visible(False)
+
+    ax.set_xticks(np.arange(len(authors)+1) - .5, minor=True)
+    ax.set_yticks(np.arange(len(paragraphs)+1) - .5, minor=True)
+
+    ax.grid(which="minor", color="k", linestyle='-', linewidth=1)
+    ax.tick_params(which="minor", bottom=False, left=False)
+
+    # Loop over data dimensions and create text annotations.
+    # for i in range(len(authors)):
+    #     for j in range(len(paragraphs)):
+    #         text = ax.text(j, i, f'{attributions[i, j]:.2f}', ha="center", va="center", color="w")
+
+    # ax.set_title("Attribution matrix")
+    fig.tight_layout()
+    # plt.show()
+    plt.savefig(path)
+
+import sys
+authors1 = ['ClaraAssisiensis', 'GiovanniBoccaccio', 'GuidoFaba', 'PierDellaVigna']
+authors2 = ['BeneFlorentinus', 'BenvenutoDaImola', 'BoncompagnoDaSigna',
+            'FilippoVillani', 'GiovanniBoccaccio', 'GiovanniDelVirgilio',
+            'GrazioloBambaglioli', 'GuidoDaPisa',
+            'GuidoDeColumnis', 'GuidoFaba', 'IacobusDeVaragine', 'IohannesDeAppia',
+            'IohannesDePlanoCarpini', 'IulianusDeSpira', 'NicolaTrevet',
+            'PietroAlighieri', 'RaimundusLullus',
+            'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
+authors3 = sorted(np.unique(authors1 + authors2).tolist())
+
+for epistola in [1]:
+    paragraph_offset = 1
+    label_offset = 0.2
+    if epistola == 1:
+        authors = ['Dante'] + authors1
+        figsize = (4, 4)
+    elif epistola == 2:
+        authors = ['Dante'] + authors2
+        figsize = (6, 4)
+    else:
+        authors = ['Dante'] + authors3
+
+    attributions = np.load(f'attribution_ep{epistola}_xiv.npy')
+    plot_attribution(f'plot{epistola}_xiv.png', authors, attributions, paragraph_offset=paragraph_offset, figsize=figsize, label_offset=label_offset)
+sys.exit(0)
+
+for epistola in [1]:
+
+    author_attribution = []
+    print(f'Epistola {epistola}')
+    print('='*80)
+    path = f'../testiXIV_{epistola}'
+
+    if epistola == 1:
+        authors = ['Dante'] + authors1
+    elif epistola == 2:
+        authors = ['Dante'] + authors2
+        path += '_tutti'
+    else:
+        authors = ['Dante'] + authors3
+
+    discarded = 0
+    f1_scores = []
+    counters = []
+    for i, author in enumerate(authors):
+        print('=' * 80)
+        print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors)))
+        print('Corpus of Epistola {}'.format(epistola))
+        print('=' * 80)
+
+        target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in range(1, 6)]
+        positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
+
+        n_full_docs = len(positive) + len(negative)
+
+        feature_extractor = FeatureExtractor(function_words_freq='latin',
+                                             conjugations_freq='latin',
+                                             features_Mendenhall=True,
+                                             features_sentenceLengths=True,
+                                             tfidf_feat_selection_ratio=0.1,
+                                             wordngrams=True, n_wordngrams=(1, 2),
+                                             charngrams=True, n_charngrams=(3, 4, 5),
+                                             preserve_punctuation=False,
+                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
+                                             normalize_features=True)
+
+        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
+
+        print('Fitting the Verificator')
+        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
+        av.fit(Xtr, ytr, groups)
+
+        attributions = []
+        # j avoids shadowing the author index i of the enclosing loop
+        for j, target_text in enumerate(ep_texts):
+            ep = feature_extractor.transform(target_text, avoid_splitting=True)
+            prob, _ = av.predict_proba(ep, epistola_name=target[j])
+            attributions.append(prob)
+        author_attribution.append(attributions)
+
+    author_attribution = np.asarray(author_attribution)
+    attribution_path = f'attribution_ep{epistola}_xiv.npy'
+    print(f'saving attribution matrix of shape {author_attribution.shape} in {attribution_path}')
+    np.save(attribution_path, author_attribution)
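
The script above keeps the expensive part (fitting one verificator per candidate and scoring the targets) behind a sys.exit(0) guard: a normal run only reloads the probabilities previously saved in attribution_ep{N}_xiv.npy and replots, while removing the exit re-runs the verificators and regenerates the file. A small sanity-check sketch of the shape convention this implies (the 1 + 5 targets, the full letter plus paragraphs 1-5 of Epistola_ArigoVII, presumably the letter to Henry VII, follow from range(1, 6) above):

    import numpy as np

    author_attribution = np.load('attribution_ep1_xiv.npy')
    n_authors, n_targets = author_attribution.shape
    # author_attribution[i, j] = estimated P(author i wrote target j)
    assert n_targets == 1 + 5, 'full letter + paragraphs 1..5'
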
diff --git a/src/author_verification_XIV.py b/src/author_verification_XIV.py
new file mode 100755
index 0000000..ad1b7ce
--- /dev/null
+++ b/src/author_verification_XIV.py
@@ -0,0 +1,77 @@
+from sklearn.linear_model import LogisticRegression
+from data.dante_loader import load_texts
+from data.features import *
+from model import AuthorshipVerificator, f1_from_counters
+from sklearn.svm import LinearSVC, SVC
+from util.color_visualization import color
+import pickle
+import os
+
+for epistola in [1, 2, 3]:  # 3 means "both Ep1 and Ep2 corpora"
+
+    print('Epistola {}'.format(epistola))
+    print('='*80)
+    path = '../testiXIV_{}'.format(epistola)
+    paragraphs = range(1, 6)
+    if epistola == 2:
+        path += '_tutti'
+
+    target = [f'Epistola_ArigoVII.txt'] + [f'Epistola_ArigoVII_{paragraph}.txt' for paragraph in paragraphs]
+    positive, negative, ep_texts = load_texts(path, positive_author='Dante', unknown_target=target, train_skip_prefix='Epistola_ArigoVII')
+
+    # cache the per-target probabilities so reruns skip the fitting step
+    pickle_file = f'../dante_color/epistola{epistola}_xiv.pkl'
+    if os.path.exists(pickle_file):
+        print(f'loading pickle file {pickle_file}')
+        probabilities = pickle.load(open(pickle_file, 'rb'))
+    else:
+        print(f'generating pickle file')
+        n_full_docs = len(positive) + len(negative)
+
+        feature_extractor = FeatureExtractor(function_words_freq='latin',
+                                             conjugations_freq='latin',
+                                             features_Mendenhall=True,
+                                             features_sentenceLengths=True,
+                                             tfidf_feat_selection_ratio=0.1,
+                                             wordngrams=True, n_wordngrams=(1, 2),
+                                             charngrams=True, n_charngrams=(3, 4, 5),
+                                             preserve_punctuation=False,
+                                             split_documents=True, split_policy=split_by_sentences, window_size=3,
+                                             normalize_features=True)
+
+        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
+        print(ytr)
+
+        print('Fitting the Verificator')
+        av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name='Dante')
+        av.fit(Xtr, ytr, groups)
+
+        probabilities = []
+        for i, target_text in enumerate(ep_texts):
+            ep = feature_extractor.transform(target_text, avoid_splitting=True)
+            prob, _ = av.predict_proba(ep, epistola_name=target[i])
+            probabilities.append(prob)
+
+        pickle.dump(probabilities, open(pickle_file, 'wb'), pickle.HIGHEST_PROTOCOL)
+
+    color(path=f'../dante_color/epistola{epistola}_xiv.html', texts=ep_texts,
+          probabilities=probabilities, title=f'Epistola {epistola}',
+          paragraph_offset=paragraphs[0])
+
+    # print('Predicting the Epistola {}'.format(epistola))
+    # title = 'Epistola {}'.format('I' if epistola == 1 else 'II')
+    # av.predict(ep, title)
+    # fulldoc_prob, fragment_probs = av.predict_proba(ep, title)
+    # color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title)
+
+    # score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False)
+    # print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+
+    # score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True)
+    # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std))
+    # f1_ = f1_from_counters(tp, fp, fn, tn)
+    # print('F1 = {:.3f}'.format(f1_))
+
+    # score_ave, score_std = av.leave_one_out(Xtr, ytr, None)
+    # print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std))
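
The commented-out leave-one-out block reports F1 from raw counters via f1_from_counters(tp, fp, fn, tn). For reference, a sketch of what that helper presumably computes (the argument order is taken from the call above; model.py remains the authoritative source):

    def f1_from_counters(tp, fp, fn, tn):
        # F1 = 2*precision*recall / (precision + recall) = 2*tp / (2*tp + fp + fn);
        # tn does not enter the formula
        denom = 2 * tp + fp + fn
        return 2 * tp / denom if denom > 0 else 0.0  # 0.0 for the all-negative case is a convention
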
diff --git a/src/util/colormap.py b/src/util/colormap.py
new file mode 100755
index 0000000..d4e08d5
--- /dev/null
+++ b/src/util/colormap.py
@@ -0,0 +1,35 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+# Have colormaps separated into categories:
+# http://matplotlib.org/examples/color/colormaps_reference.html
+cmaps = [('Diverging', ['RdYlGn', 'RdYlGn']), ]
+
+
+nrows = max(len(cmap_list) for cmap_category, cmap_list in cmaps)
+gradient = np.linspace(0.25, 0.75, 256)
+gradient = np.vstack((gradient, gradient))
+
+
+def plot_color_gradients(cmap_category, cmap_list, nrows):
+    fig, axes = plt.subplots(nrows=nrows)
+    fig.subplots_adjust(top=0.95, bottom=0.01, left=0.2, right=0.99)
+    axes[0].set_title(cmap_category + ' colormaps', fontsize=14)
+
+    for ax, name in zip(axes, cmap_list):
+        ax.imshow(gradient, aspect='auto', cmap=plt.get_cmap(name))
+        pos = list(ax.get_position().bounds)
+        x_text = pos[0] - 0.01
+        y_text = pos[1] + pos[3]/2.
+        fig.text(x_text, y_text, name, va='center', ha='right', fontsize=10)
+
+    # Turn off *all* ticks & spines, not just the ones with colormaps.
+    for ax in axes:
+        ax.set_axis_off()
+
+
+for cmap_category, cmap_list in cmaps:
+    plot_color_gradients(cmap_category, cmap_list, nrows)
+
+plt.show()
\ No newline at end of file
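
The gradient here is sampled over np.linspace(0.25, 0.75, 256) rather than the full 0-1 range, so the strips preview only the middle band of RdYlGn, muting the saturated red and green extremes. Should that band be wanted as an actual colormap elsewhere (e.g. for the probability coloring), a truncated copy can be built with standard matplotlib calls; a minimal sketch, not code from this repository:

    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.colors import LinearSegmentedColormap

    # sample the 0.25-0.75 band of RdYlGn and rebuild it as a full-range colormap
    base = plt.get_cmap('RdYlGn')
    rdylgn_mid = LinearSegmentedColormap.from_list('RdYlGn_mid',
                                                   base(np.linspace(0.25, 0.75, 256)))
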
diff --git a/src/weight_inspection.py b/src/weight_inspection.py
new file mode 100755
index 0000000..410b077
--- /dev/null
+++ b/src/weight_inspection.py
@@ -0,0 +1,59 @@
+from sklearn.linear_model import LogisticRegression
+from data.dante_loader import load_texts
+from data.features import *
+from model import AuthorshipVerificator, f1_from_counters
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+
+for epistola in [2]:
+
+    author_attribution = []
+    print(f'Epistola {epistola}')
+    print('='*80)
+    path = f'../testi_{epistola}'
+    if epistola == 2:
+        path += '_tutti'
+
+    author = 'Dante'
+    print('=' * 80)
+    print('Corpus of Epistola {}'.format(epistola))
+    print('=' * 80)
+
+    positive, negative, ep_texts = load_texts(path, positive_author=author, unknown_target=f'EpistolaXIII_{epistola}.txt')
+
+    n_full_docs = len(positive) + len(negative)
+
+    feature_extractor = FeatureExtractor(function_words_freq='latin',
+                                         conjugations_freq='latin',
+                                         features_Mendenhall=True,
+                                         features_sentenceLengths=True,
+                                         tfidf_feat_selection_ratio=0.1,
+                                         wordngrams=True, n_wordngrams=(1, 2),
+                                         charngrams=True, n_charngrams=(3, 4, 5),
+                                         preserve_punctuation=False,
+                                         split_documents=True, split_policy=split_by_sentences, window_size=3,
+                                         normalize_features=True)
+
+    Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
+
+    print('Fitting the Verificator')
+    av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression, author_name=author)
+    av.fit(Xtr, ytr, groups)
+
+    # rank features by the signed weight the logistic regressor assigns them
+    feat_rank = np.argsort(av.estimator.coef_[0])
+    coef_ordered = av.estimator.coef_[0][feat_rank]
+    feat_name_ordered = feature_extractor.feature_names[feat_rank]
+
+    print('Most Dantesque features:')
+    for i in range(100):
+        print(f'{i}: {feat_name_ordered[::-1][i]} {coef_ordered[::-1][i]:.3f}')
+
+    print('\nMost non-Dantesque features:')
+    for i in range(100):
+        print(f'{i}: {feat_name_ordered[i]} {coef_ordered[i]:.3f}')
+
+    print('done')
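
weight_inspection.py ranks features by the signed weight the logistic regressor assigns them: np.argsort is ascending, so the plain order lists the most negative (non-Dantesque) weights first and the [::-1] view the most positive (Dantesque) ones. A toy illustration of that indexing logic, with hypothetical feature names:

    import numpy as np

    coef = np.array([0.7, -1.2, 0.1])
    names = np.array(['et', 'quod', 'igitur'])  # hypothetical feature names
    rank = np.argsort(coef)                     # ascending: [1, 2, 0]
    print(names[rank][::-1], coef[rank][::-1])  # ['et' 'igitur' 'quod'] [ 0.7  0.1 -1.2]
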