minor fixes and added some comments
This commit is contained in:
parent 6d65bfb03d
commit c084babebd
@@ -1,4 +0,0 @@
-This is Bonifacio II (1150-1207), who was one of the leaders of the Fourth Crusade, which ended with the capture
-of Constantinople (1204) and the creation of the Latin Empire of the East (Bonifacio received the Kingdom of
-Thessalonica). The Provençal troubadours of his time extolled the generous hospitality of his court
-(Rambaut de Vaqueiras praises him by comparing him precisely to Alexander the Great).
@@ -7,7 +7,7 @@ import json
 import time


-def testing_dbpedia(author):
+def query_dbpedia(author):
     endpoint = 'http://dbpedia.org/sparql'
     s = sparql.Service(endpoint, "utf-8", "SELECT")
     query_author = """SELECT ?names WHERE {{
@@ -23,7 +23,7 @@ def testing_dbpedia(author):
     return [result, results_works]


-def testing_wikidata(entity_q):
+def query_wikidata(entity_q):
     """
     Notable work = P800
     Date of birth = P569
@@ -95,11 +95,11 @@ full_auth_list = author_uri_list_convivio + author_uri_list_monarchia + author_u
 dict_res = {}
 print(f'# Number of authors: {len(full_auth_list)}')
 for auth in tqdm.tqdm(full_auth_list):
-    entity_q = testing_dbpedia(auth)[0]
+    entity_q = query_dbpedia(auth)[0]
     wikidata_endp = extract_wikidata_endpoint(entity_q, show_warnings=False)
     dict_res[wikidata_endp] = None
     if wikidata_endp is not None:
-        _, names, works, other_works, y_birth = testing_wikidata(wikidata_endp)
+        _, names, works, other_works, y_birth = query_wikidata(wikidata_endp)
         dict_res[wikidata_endp] = {'aut_name': names,
                                    'aut_works': works,
                                    'aut_present_work': other_works,
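For context on the renamed functions: query_dbpedia resolves an author URI on DBpedia, extract_wikidata_endpoint bridges to the corresponding Wikidata QID, and query_wikidata then pulls names, notable works (P800), and date of birth (P569). A minimal sketch of the Wikidata step over plain HTTP follows; this is illustrative only, not the project's sparql-client call, and the User-Agent string is invented:

import requests

# Sketch: fetch notable works (P800) and date of birth (P569) for a QID
# from the public Wikidata SPARQL endpoint, returning raw JSON bindings.
def sketch_query_wikidata(entity_q):
    query = f"""
    SELECT ?workLabel ?birth WHERE {{
      OPTIONAL {{ wd:{entity_q} wdt:P800 ?work . }}
      OPTIONAL {{ wd:{entity_q} wdt:P569 ?birth . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en,it". }}
    }}"""
    r = requests.get('https://query.wikidata.org/sparql',
                     params={'query': query, 'format': 'json'},
                     headers={'User-Agent': 'hdn-el-sketch/0.1 (example)'})
    r.raise_for_status()
    return r.json()['results']['bindings']

# e.g. sketch_query_wikidata('Q1067') lists Dante's notable works and birth date.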
@@ -1,49 +0,0 @@
-import pickle
-from pprint import pprint
-"""
-with open('./KB_abs_convivio.pickle', 'rb') as infile:
-    kb1 = pickle.load(infile)
-print(len(kb1))
-
-with open('./KB_abs_monarchia.pickle', 'rb') as infile:
-    kb2 = pickle.load(infile)
-print(len(kb2))
-
-
-def merge_dicts(iter_dict):
-    merged = {}
-    for i, dict in enumerate(iter_dict):
-        if i == 0:
-            merged = iter_dict[i]
-            continue
-        else:
-            for k, v in iter_dict[i].items():
-                if k not in merged.keys():
-                    merged[k] = v
-    return merged
-
-merged = merge_dicts([kb1, kb2])
-
-with open('./KB_abs_merged.pickle', 'wb') as infile:
-    pickle.dump(merged, infile)
-
-from pprint import pprint
-pprint(merged['Giles_of_Rome'])
-"""
-with open('knowledge_base/KB_abs_merged.pickle', 'rb') as infile:
-    kb = pickle.load(infile)
-
-reversed_dict = {}
-for key in kb.keys():
-    name_list = kb[key]['names']
-    to_add = {'name': 'None', 'birth': 'None', 'abstract': 'None'}
-    for name in name_list:
-        to_add['name'] = key
-        to_add['birth'] = kb[key]['birth']
-        to_add['abstract'] = kb[key]['abstract']
-        reversed_dict[name] = to_add
-
-print(len(reversed_dict))
-
-with open('knowledge_base/KB_abs_reversed.pickle', 'wb') as outfile:
-    pickle.dump(reversed_dict, outfile)
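One detail worth flagging in this deleted script: a single to_add dict is created per author and then assigned to every alias, so all aliases of an author share one object. That is harmless here because the values are identical, but it is fragile. A minimal sketch of the same inversion with one fresh dict per alias, assuming the KB structure shown above (the toy entry is invented for illustration):

# Toy KB mirroring the structure the script loads from the pickle.
kb = {'Giles_of_Rome': {'names': ['Giles of Rome', 'Egidio Romano'],
                        'birth': '1243', 'abstract': '...'}}

# Invert the author-keyed KB into an alias-keyed lookup, one dict per alias.
reversed_kb = {
    name: {'name': key, 'birth': entry['birth'], 'abstract': entry['abstract']}
    for key, entry in kb.items()
    for name in entry['names']
}
# reversed_kb['Egidio Romano'] -> {'name': 'Giles_of_Rome', 'birth': '1243', ...}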
@@ -18,15 +18,22 @@ class KnowledgeBase:
         self._popolate_aut2id()

     def link_entities(self, preds, deepfuzz=False):
-        COMMEDIA_DATE = 1321
         PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
         WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
-        # print('-'*50)
-        # print(f'Candidate authors (i.e., entities matched): {PER_preds}')
-        # print(f'Candidates work :{WORK_preds}')
-
-        print('-'*50 + '\n\nOUTPUT:\n### Author matches:')
+        COMMEDIA_DATE = 1321
+        print('\nMODEL RAW PREDICTIONS:')
+        print(f'Candidate authors (i.e., entities matched): {PER_preds}')
+        print(f'Candidates work :{WORK_preds}')
+        print('-'*50 + '\n\nFINAL OUTPUT:\n### Author matches:')

+        """
+        Sorting PER_preds (i.e., entities of type PERSON matched by the NER model in the given commentary) according
+        to the edit distance computed across all of the entities' names in the KB. First, the similarity measure
+        is computed between the match and the exact name stored in the KB. Then, if deepfuzz == True, the same
+        metric is computed between the original NER match and every author name in the KB, additionally split on
+        ' ' (i.e., space) to handle name abbreviations (e.g., 'Tommaso d'Aquino' is often referred to simply as
+        'Tommaso' in the commentaries). Once sorted, the first element satisfying the given threshold is returned.
+        """
         aut_res = []
         for target in set(PER_preds):
             scores = []
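As a hedged illustration of the deepfuzz pass described in the docstring above (scoring a NER match against each KB name and against its space-separated tokens to catch abbreviations), here is a minimal standard-library sketch; kb_names is a stand-in list, not the project's KB access:

from difflib import SequenceMatcher

# Score `target` against every KB name and each of its space-separated
# tokens, keep the best similarity per name, and sort descending.
def deepfuzz_score(target, kb_names):
    scored = []
    for name in kb_names:
        candidates = [name] + name.split(' ')
        sim = max(SequenceMatcher(None, target.lower(), c.lower()).ratio()
                  for c in candidates)
        scored.append((name, sim))
    scored.sort(key=lambda tup: tup[1], reverse=True)
    return scored  # the caller keeps the first entry above its threshold

# e.g. deepfuzz_score('Tommaso', ["Tommaso d'Aquino", 'Alberto Magno'])
# ranks "Tommaso d'Aquino" first: the 'Tommaso' token matches with ratio 1.0.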
@@ -40,7 +47,6 @@ class KnowledgeBase:
             for i in range(3):
                 if scores[i][1] > .8:
                     print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}')
-                    #, scores[i][1]}') # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
                     success = True
                     aut_res.append(target)
                     break
@@ -55,13 +61,18 @@ class KnowledgeBase:
                     deepscore.append((aut, sim))
                 deepscore.sort(key=lambda tup: tup[1], reverse=True)
                 for j in range(3):
-                    if deepscore[j][1] > .9:
+                    if deepscore[j][1] > .8:
                         print(
                             f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}')
-                        #, deepscore[j][1]}') # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
                         aut_res.append(target)
                         break

+        """
+        Sorting WORK_preds (i.e., entities of type WORK_OF_ART matched by the NER model in the given commentary)
+        according to the edit distance computed across all of the titles in the KB. The similarity measure
+        is computed between the match and the exact title stored in the KB.
+        Once sorted, the first element satisfying the given threshold is returned.
+        """
         work_res = {}
         if len(WORK_preds) != 0:
             print('-' * 50 + '\n### Works matches:')
@@ -72,7 +83,7 @@ class KnowledgeBase:
                 scores_work.append((work, sim))
             scores_work.sort(key=lambda tup: tup[1], reverse=True)
             for i in range(3):
-                if scores_work[i][1] > .75:
+                if scores_work[i][1] > .7:
                     print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}')
                     work_res[target] = self.works2aut[scores_work[i][0]]
                     break
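Both threshold tweaks in this commit (.9 to .8 for the deepfuzz pass, .75 to .7 for works) feed the same selection pattern: scan the top three scored candidates and keep the first one clearing the threshold. A small sketch of that pattern as a standalone helper; note that slicing also avoids the IndexError the range(3) loops would raise with fewer than three candidates:

# First scored candidate above `threshold` among the top `k`, else None.
# `scored` must be sorted descending by similarity, as in link_entities.
def first_above(scored, k=3, threshold=0.7):
    for candidate, sim in scored[:k]:
        if sim > threshold:
            return candidate
    return None

# e.g. first_above([('Convivio', 0.92), ('Monarchia', 0.41)]) -> 'Convivio'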
2 binary files not shown.
4 file diffs suppressed because one or more lines are too long.
main.py (13 changes)
@@ -14,8 +14,11 @@ COMMENTARIES_PATH = './commentaries/'
 DF_COMMENTARIES_PATH = './commentaries/data_parsed/'
 df_monarchia = pd.read_csv(DF_COMMENTARIES_PATH + 'monarchia_DF.csv')
 df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
+df_rime = pd.read_csv(DF_COMMENTARIES_PATH + 'rime_DF.csv')
 df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')

+# TODO --> rime.xml
+

 def pprint_com(comment, l=100):
     i = 0
@@ -103,7 +106,7 @@ def predict_candidates(model, comment, labels=None):
     if labels is not None:
         query = comment
         gold = labels[labels['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
-        print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
+        print(f'{len(gold)} GOLD TARGETS:')
         for i in range(len(gold)):
            elem = gold.iloc[i]
            print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')
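The gold-target lookup above filters the evaluation DataFrame by the raw comment text. A minimal, self-contained sketch of that pattern, with the column names taken from the diff and the rows invented purely for illustration:

import pandas as pd

# Toy frame mirroring the columns predict_candidates' gold lookup uses.
labels = pd.DataFrame({
    'comment': ['c1', 'c1', 'c2'],
    'quot_title': ['Ethica', 'Metaphysica', 'Aeneis'],
    'quot_author': ['Aristotele', 'Aristotele', 'Virgilio'],
    'quot_type': ['explicit', 'implicit', 'explicit'],
    'quot_uri': ['u1', 'u2', 'u3'],
})
gold = labels[labels['comment'] == 'c1'][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
print(f'{len(gold)} GOLD TARGETS:')  # -> 2 GOLD TARGETS: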
@@ -147,13 +150,13 @@ def connect_aut_work(list_aut, list_work, kb):
     print('\n\nTODO')
     # qid_list = [kb.aut2id[author] for author in list_aut]
     # wid_list = [kb.works2aut[work] for work in list_work]
     # print('lel')


 def main():
     df_TRAIN = df_monarchia
-    df_eval = df_convivio
+    df_eval = df_rime
     dataset = DataSetBuilder(df_TRAIN, df_eval, df_ner_unique)

     TRAIN_DATA = dataset.import_dataset_doccano('./commentaries/data_parsed/doccano_data/from_doccano_hdn1.json')
     commentaries_convivio_eva = dataset.clean_commentaries_eva
     commentaries_monarchia = dataset.clean_commentaries
@@ -163,9 +166,9 @@ def main():
     nlp = spacy.load('./model_fastText/model_spacy_latest')

     seed = random.randint(1, len(commentaries_convivio_eva))
-    preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)

-    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval)
+    preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
+    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia.json', extension=df_eval)
     aut_res, work_res = kb.link_entities(preds, deepfuzz=True)

     # Testing ------------------------