minor fixes and added some comments

commit c084babebd (parent 6d65bfb03d)
andrea, 2020-10-28 09:52:45 +01:00
11 changed files with 34 additions and 76 deletions


@@ -1,4 +0,0 @@
-This is Bonifacio II (1150-1207), who was one of the leaders of the Fourth Crusade, which ended with
-the capture of Constantinople (1204) and the creation of the Latin Empire of the East (Bonifacio
-received the kingdom of Thessalonica). The Provençal troubadours of his time exalted the generous
-hospitality of his court (Rambaut de Vaqueiras praises him by comparing him to Alexander the Great himself).


@@ -7,7 +7,7 @@ import json
 import time

-def testing_dbpedia(author):
+def query_dbpedia(author):
     endpoint = 'http://dbpedia.org/sparql'
     s = sparql.Service(endpoint, "utf-8", "SELECT")
     query_author = """SELECT ?names WHERE {{
@@ -23,7 +23,7 @@ def testing_dbpedia(author):
     return [result, results_works]

-def testing_wikidata(entity_q):
+def query_wikidata(entity_q):
     """
     Notable work = P800
     Date of birth = P569
@@ -95,11 +95,11 @@ full_auth_list = author_uri_list_convivio + author_uri_list_monarchia + author_u
 dict_res = {}
 print(f'# Number of authors: {len(full_auth_list)}')
 for auth in tqdm.tqdm(full_auth_list):
-    entity_q = testing_dbpedia(auth)[0]
+    entity_q = query_dbpedia(auth)[0]
     wikidata_endp = extract_wikidata_endpoint(entity_q, show_warnings=False)
     dict_res[wikidata_endp] = None
     if wikidata_endp is not None:
-        _, names, works, other_works, y_birth = testing_wikidata(wikidata_endp)
+        _, names, works, other_works, y_birth = query_wikidata(wikidata_endp)
         dict_res[wikidata_endp] = {'aut_name': names,
                                    'aut_works': works,
                                    'aut_present_work': other_works,
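
The two renamed helpers are thin wrappers around SPARQL lookups. As a point of reference, here is a minimal self-contained sketch of the same flow written against SPARQLWrapper instead of the sparql-client module the script imports; the function names and SELECT clauses are illustrative assumptions, not the repository's code:

from SPARQLWrapper import SPARQLWrapper, JSON

def query_dbpedia_labels(author_uri):
    # Hypothetical stand-in for query_dbpedia: fetch the rdfs:label values
    # attached to an author resource on the public DBpedia endpoint.
    endpoint = SPARQLWrapper('http://dbpedia.org/sparql')
    endpoint.setQuery(f'SELECT ?names WHERE {{ <{author_uri}> rdfs:label ?names . }}')
    endpoint.setReturnFormat(JSON)
    rows = endpoint.query().convert()['results']['bindings']
    return [row['names']['value'] for row in rows]

def query_wikidata_facts(entity_q):
    # Hypothetical stand-in for query_wikidata; P800 = notable work and
    # P569 = date of birth, the properties named in the docstring above.
    endpoint = SPARQLWrapper('https://query.wikidata.org/sparql')
    endpoint.setQuery(f"""SELECT ?work ?birth WHERE {{
        OPTIONAL {{ wd:{entity_q} wdt:P800 ?work . }}
        OPTIONAL {{ wd:{entity_q} wdt:P569 ?birth . }}
    }}""")
    endpoint.setReturnFormat(JSON)
    return endpoint.query().convert()['results']['bindings']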


@@ -1,49 +0,0 @@
-import pickle
-from pprint import pprint
-
-"""
-with open('./KB_abs_convivio.pickle', 'rb') as infile:
-    kb1 = pickle.load(infile)
-print(len(kb1))
-
-with open('./KB_abs_monarchia.pickle', 'rb') as infile:
-    kb2 = pickle.load(infile)
-print(len(kb2))
-
-def merge_dicts(iter_dict):
-    merged = {}
-    for i, dict in enumerate(iter_dict):
-        if i == 0:
-            merged = iter_dict[i]
-            continue
-        else:
-            for k, v in iter_dict[i].items():
-                if k not in merged.keys():
-                    merged[k] = v
-    return merged
-
-merged = merge_dicts([kb1, kb2])
-
-with open('./KB_abs_merged.pickle', 'wb') as infile:
-    pickle.dump(merged, infile)
-
-from pprint import pprint
-pprint(merged['Giles_of_Rome'])
-"""
-
-with open('knowledge_base/KB_abs_merged.pickle', 'rb') as infile:
-    kb = pickle.load(infile)
-
-reversed_dict = {}
-for key in kb.keys():
-    name_list = kb[key]['names']
-    to_add = {'name': 'None', 'birth': 'None', 'abstract': 'None'}
-    for name in name_list:
-        to_add['name'] = key
-        to_add['birth'] = kb[key]['birth']
-        to_add['abstract'] = kb[key]['abstract']
-        reversed_dict[name] = to_add
-
-print(len(reversed_dict))
-with open('knowledge_base/KB_abs_reversed.pickle', 'wb') as outfile:
-    pickle.dump(reversed_dict, outfile)


@@ -18,15 +18,22 @@ class KnowledgeBase:
         self._popolate_aut2id()

     def link_entities(self, preds, deepfuzz=False):
-        COMMEDIA_DATE = 1321
         PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
         WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
-        # print('-'*50)
-        # print(f'Candidate authors (i.e., entities matched): {PER_preds}')
-        # print(f'Candidates work :{WORK_preds}')
-        print('-'*50 + '\n\nFINAL OUTPUT:\n### Author matches:')
+        print('\nMODEL RAW PREDICTIONS:')
+        print(f'Candidate authors (i.e., entities matched): {PER_preds}')
+        print(f'Candidates work :{WORK_preds}')
+        COMMEDIA_DATE = 1321
+        print('-'*50 + '\n\nOUTPUT:\n### Author matches:')
+        """
+        Sort PER_preds (i.e., entities of type PERSON matched by the NER model in the given commentary)
+        by the edit distance computed against all of the entity names in the KB. First, the similarity
+        measure is computed between the NER match and the exact name stored in the KB. Then, if
+        deepfuzz == True, the same metric is also computed between the original NER match and every
+        author name in the KB split on ' ' (i.e., space), in order to deal with name abbreviations
+        ('Tommaso d'Aquino' is often referred to simply as 'Tommaso' in the commentaries).
+        Once sorted, the first element satisfying the given threshold is returned.
+        """
         aut_res = []
         for target in set(PER_preds):
             scores = []
@@ -40,7 +47,6 @@ class KnowledgeBase:
             for i in range(3):
                 if scores[i][1] > .8:
                     print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}')
-                    #, scores[i][1]}')  # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
                     success = True
                     aut_res.append(target)
                     break
@@ -55,13 +61,18 @@ class KnowledgeBase:
                     deepscore.append((aut, sim))
                 deepscore.sort(key=lambda tup: tup[1], reverse=True)
                 for j in range(3):
-                    if deepscore[j][1] > .9:
+                    if deepscore[j][1] > .8:
                         print(
                             f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}')
-                        #, deepscore[j][1]}')  # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
                         aut_res.append(target)
                         break
+        """
+        Sort WORK_preds (i.e., entities of type WORK_OF_ART matched by the NER model in the given
+        commentary) by the edit distance computed against all of the work titles in the KB. The
+        similarity measure is computed between the NER match and the exact title stored in the KB.
+        Once sorted, the first element satisfying the given threshold is returned.
+        """
         work_res = {}
         if len(WORK_preds) != 0:
             print('-' * 50 + '\n### Works matches:')
@@ -72,7 +83,7 @@ class KnowledgeBase:
                 scores_work.append((work, sim))
             scores_work.sort(key=lambda tup: tup[1], reverse=True)
             for i in range(3):
-                if scores_work[i][1] > .75:
+                if scores_work[i][1] > .7:
                     print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}')
                     work_res[target] = self.works2aut[scores_work[i][0]]
                     break
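
Both branches follow the pattern the new docstrings describe: score every KB entry against the NER match, sort, and take the first of the top three candidates that clears a threshold. A stand-alone sketch of that pattern, using stdlib difflib as a placeholder for whatever similarity function link_entities actually calls (the real one is not shown in this diff):

from difflib import SequenceMatcher

def best_fuzzy_match(target, candidates, threshold=0.8, deepfuzz=False):
    # Score the NER match against every candidate KB name; with deepfuzz,
    # also score each space-separated token so that 'Tommaso' can still
    # hit 'Tommaso d'Aquino', as the added docstring explains.
    scored = []
    for cand in candidates:
        variants = [cand] + (cand.split(' ') if deepfuzz else [])
        sim = max(SequenceMatcher(None, target, v).ratio() for v in variants)
        scored.append((cand, sim))
    scored.sort(key=lambda tup: tup[1], reverse=True)
    # As in link_entities, return the first top candidate that satisfies
    # the threshold, or nothing.
    for cand, sim in scored[:3]:
        if sim > threshold:
            return cand
    return None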

Four file diffs suppressed because one or more lines are too long.

main.py

@@ -14,8 +14,11 @@ COMMENTARIES_PATH = './commentaries/'
 DF_COMMENTARIES_PATH = './commentaries/data_parsed/'

 df_monarchia = pd.read_csv(DF_COMMENTARIES_PATH + 'monarchia_DF.csv')
 df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
+df_rime = pd.read_csv(DF_COMMENTARIES_PATH + 'rime_DF.csv')
 df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')
+# TODO --> rime.xml

 def pprint_com(comment, l=100):
     i = 0
@@ -103,7 +106,7 @@ def predict_candidates(model, comment, labels=None):
     if labels is not None:
         query = comment
         gold = labels[labels['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
-        print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
+        print(f'{len(gold)} GOLD TARGETS:')
         for i in range(len(gold)):
             elem = gold.iloc[i]
             print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')
@@ -147,13 +150,13 @@ def connect_aut_work(list_aut, list_work, kb):
     print('\n\nTODO')
     # qid_list = [kb.aut2id[author] for author in list_aut]
     # wid_list = [kb.works2aut[work] for work in list_work]
-    # print('lel')

 def main():
     df_TRAIN = df_monarchia
-    df_eval = df_convivio
+    df_eval = df_rime
     dataset = DataSetBuilder(df_TRAIN, df_eval, df_ner_unique)
     TRAIN_DATA = dataset.import_dataset_doccano('./commentaries/data_parsed/doccano_data/from_doccano_hdn1.json')
     commentaries_convivio_eva = dataset.clean_commentaries_eva
     commentaries_monarchia = dataset.clean_commentaries
@@ -163,9 +166,9 @@ def main():
     nlp = spacy.load('./model_fastText/model_spacy_latest')

     seed = random.randint(1, len(commentaries_convivio_eva))
-    preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
-    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval)
+    preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
+    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia.json', extension=df_eval)
     aut_res, work_res = kb.link_entities(preds, deepfuzz=True)

     # Testing ------------------------