minor fixes and added some comments

andrea 2020-10-28 09:52:45 +01:00
parent 6d65bfb03d
commit c084babebd
11 changed files with 34 additions and 76 deletions

View File

@@ -1,4 +0,0 @@
This refers to Bonifacio II (1150-1207), who was one of the leaders of the Fourth Crusade, which
ended with the capture of Constantinople (1204) and the creation of the Latin Empire of the East
(Bonifacio received the Kingdom of Thessalonica). The Provençal troubadours of his time exalted the
generous hospitality of his court (Rambaut de Vaqueiras praises him by comparing him precisely to
Alexander the Great).

View File

@@ -7,7 +7,7 @@ import json
import time
def testing_dbpedia(author):
def query_dbpedia(author):
endpoint = 'http://dbpedia.org/sparql'
s = sparql.Service(endpoint, "utf-8", "SELECT")
query_author = """SELECT ?names WHERE {{
@@ -23,7 +23,7 @@ def testing_dbpedia(author):
return [result, results_works]
def testing_wikidata(entity_q):
def query_wikidata(entity_q):
"""
Notable work = P800
Date of birth = P569
@@ -95,11 +95,11 @@ full_auth_list = author_uri_list_convivio + author_uri_list_monarchia + author_u
dict_res = {}
print(f'# Number of authors: {len(full_auth_list)}')
for auth in tqdm.tqdm(full_auth_list):
entity_q = testing_dbpedia(auth)[0]
entity_q = query_dbpedia(auth)[0]
wikidata_endp = extract_wikidata_endpoint(entity_q, show_warnings=False)
dict_res[wikidata_endp] = None
if wikidata_endp is not None:
_, names, works, other_works, y_birth = testing_wikidata(wikidata_endp)
_, names, works, other_works, y_birth = query_wikidata(wikidata_endp)
dict_res[wikidata_endp] = {'aut_name': names,
'aut_works': works,
'aut_present_work': other_works,
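
For reference, a minimal sketch of the kind of lookup query_wikidata presumably performs, written
against the public Wikidata SPARQL endpoint with the requests library (the function name, the query
shape, and the example QID are illustrative assumptions, not the project's actual code):

import requests

def sketch_query_wikidata(entity_q):
    # P800 = notable work, P569 = date of birth (the properties named in the docstring above).
    query = f"""
    SELECT ?work ?birth WHERE {{
      OPTIONAL {{ wd:{entity_q} wdt:P800 ?work . }}
      OPTIONAL {{ wd:{entity_q} wdt:P569 ?birth . }}
    }}
    """
    resp = requests.get('https://query.wikidata.org/sparql',
                        params={'query': query, 'format': 'json'})
    resp.raise_for_status()
    return resp.json()['results']['bindings']

# sketch_query_wikidata('Q1067')  # Q1067 = Dante Alighieri on Wikidata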

View File

@@ -1,49 +0,0 @@
import pickle
from pprint import pprint

"""
with open('./KB_abs_convivio.pickle', 'rb') as infile:
    kb1 = pickle.load(infile)
print(len(kb1))

with open('./KB_abs_monarchia.pickle', 'rb') as infile:
    kb2 = pickle.load(infile)
print(len(kb2))

def merge_dicts(iter_dict):
    # First-wins merge: keys from earlier dicts take precedence over later ones.
    merged = {}
    for d in iter_dict:
        for k, v in d.items():
            if k not in merged:
                merged[k] = v
    return merged

merged = merge_dicts([kb1, kb2])

with open('./KB_abs_merged.pickle', 'wb') as outfile:
    pickle.dump(merged, outfile)

pprint(merged['Giles_of_Rome'])
"""

with open('knowledge_base/KB_abs_merged.pickle', 'rb') as infile:
    kb = pickle.load(infile)

# Invert the KB: map every alias of an author to that author's record.
reversed_dict = {}
for key in kb.keys():
    for name in kb[key]['names']:
        reversed_dict[name] = {'name': key,
                               'birth': kb[key]['birth'],
                               'abstract': kb[key]['abstract']}

print(len(reversed_dict))

with open('knowledge_base/KB_abs_reversed.pickle', 'wb') as outfile:
    pickle.dump(reversed_dict, outfile)
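
The inversion above gives O(1) lookup from any author alias to the author's record; a hypothetical
lookup (the alias string is illustrative):

with open('knowledge_base/KB_abs_reversed.pickle', 'rb') as infile:
    kb_rev = pickle.load(infile)

record = kb_rev.get("Tommaso d'Aquino")  # hypothetical alias
# record is {'name': ..., 'birth': ..., 'abstract': ...}, or None if the alias is unknown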

View File

@@ -18,15 +18,22 @@ class KnowledgeBase:
self._popolate_aut2id()
def link_entities(self, preds, deepfuzz=False):
COMMEDIA_DATE = 1321
PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
# print('-'*50)
# print(f'Candidate authors (i.e., entities matched): {PER_preds}')
# print(f'Candidates work :{WORK_preds}')
COMMEDIA_DATE = 1321
print('-'*50 + '\n\nOUTPUT:\n### Author matches:')
print('\nMODEL RAW PREDICTIONS:')
print(f'Candidate authors (i.e., entities matched): {PER_preds}')
print(f'Candidate works: {WORK_preds}')
print('-'*50 + '\n\nFINAL OUTPUT:\n### Author matches:')
"""
Sorting PER_preds (i.e., entities of type PERSON matched by the NER model in the given commentary) according
to the edit distance computed across all of the entities' name in the KB. At first, the similarity measure
is computed between the match and the exact name stored in the KB. Eventually, if deepscore == True, such a
metric is computed beteen the original NER match and every author name present in the KB but also split on
the ' ' (i.e., space) in order to deal with name abbreviations (i.e., 'Tommaso d'Aquino' is often referred to as
simply 'Tommaso' in the commentaries). Once sorted, the first element satisfying the given threshold is returned.
"""
aut_res = []
for target in set(PER_preds):
scores = []
@@ -40,7 +47,6 @@ class KnowledgeBase:
for i in range(3):
if scores[i][1] > .8:
print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}')
#, scores[i][1]}') # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
success = True
aut_res.append(target)
break
@@ -55,13 +61,18 @@ class KnowledgeBase:
deepscore.append((aut, sim))
deepscore.sort(key=lambda tup: tup[1], reverse=True)
for j in range(3):
if deepscore[j][1] > .9:
if deepscore[j][1] > .8:
print(
f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}')
#, deepscore[j][1]}') # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
aut_res.append(target)
break
"""
Sorting WORK_preds (i.e., entities of type WORK_OF_ART matched by the NER model in the given commentary)
according to the edit distance computed across all of the titles in the KB. The similarity measure
is computed between the match and the exact name stored in the KB.
Once sorted, the first element satisfying the given threshold is returned.
"""
work_res = {}
if len(WORK_preds) != 0:
print('-' * 50 + '\n### Work matches:')
@@ -72,7 +83,7 @@ class KnowledgeBase:
scores_work.append((work, sim))
scores_work.sort(key=lambda tup: tup[1], reverse=True)
for i in range(3):
if scores_work[i][1] > .75:
if scores_work[i][1] > .7:
print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}')
work_res[target] = self.works2aut[scores_work[i][0]]
break
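
The two docstrings above describe a two-pass fuzzy strategy: exact-name similarity first, then
token-level similarity when deepfuzz is enabled. A self-contained sketch of that strategy, using
difflib.SequenceMatcher as a stand-in for whatever similarity function the class actually calls
(the function name, threshold default, and the use of difflib are assumptions):

import difflib

def best_fuzzy_match(target, names, threshold=0.8, deepfuzz=False):
    # Pass 1: similarity against each full name in the KB.
    scores = [(name, difflib.SequenceMatcher(None, target, name).ratio()) for name in names]
    scores.sort(key=lambda tup: tup[1], reverse=True)
    if scores and scores[0][1] > threshold:
        return scores[0][0]
    if not deepfuzz:
        return None
    # Pass 2: similarity against each space-separated token of each name,
    # so that 'Tommaso' can still match 'Tommaso d'Aquino'.
    deepscore = []
    for name in names:
        for token in name.split(' '):
            deepscore.append((name, difflib.SequenceMatcher(None, target, token).ratio()))
    deepscore.sort(key=lambda tup: tup[1], reverse=True)
    if deepscore and deepscore[0][1] > threshold:
        return deepscore[0][0]
    return None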

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

main.py
View File

@@ -14,8 +14,11 @@ COMMENTARIES_PATH = './commentaries/'
DF_COMMENTARIES_PATH = './commentaries/data_parsed/'
df_monarchia = pd.read_csv(DF_COMMENTARIES_PATH + 'monarchia_DF.csv')
df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
df_rime = pd.read_csv(DF_COMMENTARIES_PATH + 'rime_DF.csv')
df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')
# TODO --> rime.xml
def pprint_com(comment, l=100):
i = 0
@@ -103,7 +106,7 @@ def predict_candidates(model, comment, labels=None):
if labels is not None:
query = comment
gold = labels[labels['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
print(f'{len(gold)} GOLD TARGETS:')
for i in range(len(gold)):
elem = gold.iloc[i]
print(f'Title: {elem["quot_title"]}\nAuthor: {elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')
@@ -147,13 +150,13 @@ def connect_aut_work(list_aut, list_work, kb):
print('\n\nTODO')
# qid_list = [kb.aut2id[author] for author in list_aut]
# wid_list = [kb.works2aut[work] for work in list_work]
# print('lel')
def main():
df_TRAIN = df_monarchia
df_eval = df_convivio
df_eval = df_rime
dataset = DataSetBuilder(df_TRAIN, df_eval, df_ner_unique)
TRAIN_DATA = dataset.import_dataset_doccano('./commentaries/data_parsed/doccano_data/from_doccano_hdn1.json')
commentaries_convivio_eva = dataset.clean_commentaries_eva
commentaries_monarchia = dataset.clean_commentaries
@@ -163,9 +166,9 @@ def main():
nlp = spacy.load('./model_fastText/model_spacy_latest')
seed = random.randint(0, len(commentaries_convivio_eva) - 1)  # randint is inclusive on both ends
preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval)
preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia.json', extension=df_eval)
aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
# Testing ------------------------
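
For context, the preds consumed by kb.link_entities are (text, label) pairs produced by the spaCy
model; extracting them is a one-liner over doc.ents (the sample text and the expected output shown
in the comment are illustrative):

import spacy

nlp = spacy.load('./model_fastText/model_spacy_latest')  # model path as used in main() above
doc = nlp('Come dice Aristotele nel primo de la Fisica ...')  # sample commentary text
preds = [(ent.text, ent.label_) for ent in doc.ents]
# e.g., [('Aristotele', 'PER'), ('Fisica', 'WORK_OF_ART')]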