# eventExtractionHDN/entity_linker/knowledge_base.py
# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
from difflib import SequenceMatcher
import json

import numpy as np
import pandas as pd


class KnowledgeBase:
    """JSON-backed knowledge base used to link NER mentions of authors and works."""

    def __init__(self, kb_path, extension=None):
        with open(kb_path, 'rb') as infile:
            data = json.load(infile)
        self.id2aut = data
        self.aut2id = {}
        self.works2aut = {}
        self._populate_aut2id()
        if extension is not None:
            self._extend_kb(extension)
            # Re-populate so the lookup tables also cover the extension's entries.
            self._populate_aut2id()
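
    # Expected shape of the JSON loaded above, inferred from the lookups in this
    # class; the concrete QID, names and dates below are illustrative only:
    # {
    #   "Q9438": {
    #     "aut_name":  {"it": "Tommaso d'Aquino", "en": "Thomas Aquinas"},
    #     "aut_works": {"W1": {"it": "Summa Theologiae"}},
    #     "aut_present_work": {},
    #     "birth": 1225
    #   },
    #   ...
    # }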

    def link_entities(self, preds, deepfuzz=False):
        COMMEDIA_DATE = 1321  # currently unused in this method
        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
        WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
        print('\nMODEL RAW PREDICTIONS:')
        print(f'Candidate authors (i.e., entities matched): {PER_preds}')
        print(f'Candidate works: {WORK_preds}')
        print('-' * 50 + '\n\nFINAL OUTPUT:\n### Author matches:')
"""
Sorting PER_preds (i.e., entities of type PERSON matched by the NER model in the given commentary) according
to the edit distance computed across all of the entities' name in the KB. At first, the similarity measure
is computed between the match and the exact name stored in the KB. Eventually, if deepscore == True, such a
metric is computed beteen the original NER match and every author name present in the KB but also split on
the ' ' (i.e., space) in order to deal with name abbreviations (i.e., 'Tommaso d'Aquino' is often referred to as
simply 'Tommaso' in the commentaries). Once sorted, the first element satisfying the given threshold is returned.
"""
        aut_res = []
        for target in set(PER_preds):
            scores = []
            deepscore = []
            for auth in self.aut2id.keys():
                sim = self._similar(target, auth)
                scores.append((auth, sim))
            scores.sort(key=lambda tup: tup[1], reverse=True)
            success = False
            for i in range(min(3, len(scores))):
                if scores[i][1] > .8:
                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}')
                    success = True
                    aut_res.append(target)
                    break
            if deepfuzz and not success:
                # Second pass: compare the mention against each space-separated token of
                # every author name, keeping the best token score per author.
                for aut in self.aut2id.keys():
                    _splitname = aut.split(' ')
                    sim = 0
                    for split in _splitname:
                        _sim = self._similar(target, split)
                        if _sim > sim:
                            sim = _sim
                    deepscore.append((aut, sim))
                deepscore.sort(key=lambda tup: tup[1], reverse=True)
                for j in range(min(3, len(deepscore))):
                    if deepscore[j][1] > .8:
                        print(
                            f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}')
                        aut_res.append(target)
                        break
"""
Sorting WORK_preds (i.e., entities of type WORK_OF_ART matched by the NER model in the given commentary)
according to the edit distance computed across all of the titles in the KB. The similarity measure
is computed between the match and the exact name stored in the KB.
Once sorted, the first element satisfying the given threshold is returned.
"""
        work_res = {}
        if len(WORK_preds) != 0:
            print('-' * 50 + '\n### Works matches:')
            for target in set(WORK_preds):
                scores_work = []
                for work in self.works2aut.keys():
                    sim = self._similar(target, work)
                    scores_work.append((work, sim))
                scores_work.sort(key=lambda tup: tup[1], reverse=True)
                for i in range(min(3, len(scores_work))):
                    if scores_work[i][1] > .7:
                        print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}')
                        work_res[target] = self.works2aut[scores_work[i][0]]
                        break
        return aut_res, work_res

    def _similar(self, a, b):
        # difflib edit-distance-based similarity ratio in [0, 1]; 1.0 means identical strings.
        return SequenceMatcher(None, a, b).ratio()

    def _populate_aut2id(self):
        for qid, values in self.id2aut.items():
            if qid == 'null':
                continue
            if values is not None:
                l_names = set(values['aut_name'].values())
                for name in l_names:
                    self.aut2id[name] = qid
                works = values['aut_works']
                if len(works) != 0:
                    for wid, wvalues in works.items():
                        try:
                            self.works2aut[wvalues['it']] = self.id2aut[qid]['aut_name']['it']
                        except KeyError:
                            # Skip works or authors that have no Italian label.
                            continue
        return self
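
    # After _populate_aut2id runs on the illustrative KB sketched above:
    #   self.aut2id    ~ {"Tommaso d'Aquino": "Q9438", "Thomas Aquinas": "Q9438"}
    #   self.works2aut ~ {"Summa Theologiae": "Tommaso d'Aquino"}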

    def _extend_kb(self, df):
        _qid = 0
        prev_work = ''
        for i in range(len(df)):
            row = df.iloc[i]
            auth = row.quot_author
            work = row.quot_title
            if pd.notna(auth) and pd.notna(work):
                if work != prev_work:
                    try:
                        # Known author: attach the quoted work to its existing entry.
                        qid = self.aut2id[auth]
                        new_wid = f'W{_qid}'
                        _qid += 1
                        self.id2aut[qid]['aut_works'][new_wid] = {'it': work}
                        prev_work = work
                    except KeyError:
                        # Unknown author: create a new entry with a synthetic QID.
                        new_qid = f'Q{_qid}'
                        new_wid = f'W{_qid}'
                        _qid += 1
                        self.id2aut[new_qid] = {'aut_name': {'it': auth},
                                                'aut_works': {new_wid: {'it': work}},
                                                'aut_present_work': {},
                                                'birth': 0}
                        prev_work = work
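

# Minimal usage sketch (the KB path and the NER predictions below are illustrative
# assumptions, not part of the repository):
if __name__ == '__main__':
    kb = KnowledgeBase('data/kb.json')
    preds = [('Tommaso', 'PER'), ('Virgilio', 'PER'), ('Eneide', 'WORK_OF_ART')]
    authors, works = kb.link_entities(preds, deepfuzz=True)
    print(authors, works)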