eventExtractionHDN/entity_linker/knowledge_base.py


# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
from difflib import SequenceMatcher
import json

import pandas as pd

class KnowledgeBase:
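    """Lookup tables of authors and works loaded from a JSON knowledge base.

    Keeps three indexes: ``id2aut`` (QID -> author record), ``aut2id``
    (author-name variant -> QID) and ``works2aut`` (work title -> author name).
    """
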
    def __init__(self, kb_path, extension=None):
        with open(kb_path, 'r', encoding='utf-8') as infile:
            data = json.load(infile)
        self.id2aut = data
        self.aut2id = {}
        self.works2aut = {}
        self._populate_aut2id()
        if extension is not None:
            self._extend_kb(extension)
            # Rebuild the lookup tables so they include the extension rows.
            self._populate_aut2id()

    def link_entities(self, preds, deepfuzz=False):
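        """Fuzzy-match NER predictions against the knowledge base.

        ``preds`` is a list of ``(text, label)`` pairs; ``PER`` spans are
        matched against author names and ``WORK_OF_ART`` spans against work
        titles. Returns ``(matched_authors, {work: author})``.
        """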
        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
        WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
        # print('-' * 50)
        # print(f'Candidate authors (i.e., entities matched): {PER_preds}')
        # print(f'Candidate works: {WORK_preds}')
        COMMEDIA_DATE = 1321  # reference date for the Commedia; currently unused
        print('-' * 50 + '\n\nOUTPUT:\n### Author matches:')
        aut_res = []
        for target in set(PER_preds):
            # Full-name pass: rank every author name by string similarity.
            scores = [(auth, self._similar(target, auth)) for auth in self.aut2id]
            scores.sort(key=lambda tup: tup[1], reverse=True)
            success = False
            for auth, score in scores[:3]:
                if score > .8:
                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[auth]]["aut_name"]["it"]}')
                    success = True
                    aut_res.append(target)
                    break
            if deepfuzz and not success:
                # Split-name pass: compare the target against each token of the
                # author name and keep the best token score (stricter threshold).
                deepscore = []
                for aut in self.aut2id:
                    sim = max(self._similar(target, split) for split in aut.split(' '))
                    deepscore.append((aut, sim))
                deepscore.sort(key=lambda tup: tup[1], reverse=True)
                for aut, score in deepscore[:3]:
                    if score > .9:
                        print(f'Prediction (S): {target} - {self.id2aut[self.aut2id[aut]]["aut_name"]["it"]}')
                        aut_res.append(target)
                        break
        work_res = {}
        if len(WORK_preds) != 0:
            print('-' * 50 + '\n### Works matches:')
            for target in set(WORK_preds):
                scores_work = [(work, self._similar(target, work)) for work in self.works2aut]
                scores_work.sort(key=lambda tup: tup[1], reverse=True)
                for work, score in scores_work[:3]:
                    if score > .75:
                        print(f'Prediction (F): {target} by: {self.works2aut[work]}')
                        work_res[target] = self.works2aut[work]
                        break
        return aut_res, work_res

    def _similar(self, a, b):
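        """Return the difflib similarity ratio between two strings (0.0-1.0)."""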
        return SequenceMatcher(None, a, b).ratio()

    def _populate_aut2id(self):
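        """Build the name -> QID and work -> author lookup tables from id2aut."""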
        for qid, values in self.id2aut.items():
            if qid == 'null' or values is None:
                continue
            # Map every language variant of the author name to the same QID.
            for name in set(values['aut_name'].values()):
                self.aut2id[name] = qid
            # Map each work's Italian title to the author's Italian name.
            for wid, wvalues in values['aut_works'].items():
                try:
                    self.works2aut[wvalues['it']] = self.id2aut[qid]['aut_name']['it']
                except KeyError:
                    continue
        return self

    def _extend_kb(self, df):
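        """Extend the KB with (quot_author, quot_title) pairs from a DataFrame.

        Works by known authors are attached to the existing entry; unknown
        authors get a fresh synthetic QID.
        """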
        _qid = 0
        prev_work = ''
        for i in range(len(df)):
            row = df.iloc[i]
            auth = row.quot_author
            work = row.quot_title
            # Skip rows with a missing author or title, and consecutive
            # duplicates of the same work.
            if pd.isna(auth) or pd.isna(work) or work == prev_work:
                continue
            try:
                # Known author: attach the work under a fresh work id.
                qid = self.aut2id[auth]
                new_wid = f'W{_qid}'
                _qid += 1
                self.id2aut[qid]['aut_works'][new_wid] = {'it': work}
            except KeyError:
                # Unknown author: create a new entry under a synthetic QID.
                # NB: synthetic ids ('Q0', 'W0', ...) could collide with ids
                # already present in the base KB if it uses the same scheme.
                new_qid = f'Q{_qid}'
                new_wid = f'W{_qid}'
                _qid += 1
                self.id2aut[new_qid] = {'aut_name': {'it': auth},
                                        'aut_works': {new_wid: {'it': work}},
                                        'aut_present_work': {},
                                        'birth': 0}
            prev_work = work
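

# Minimal usage sketch (not part of the module): the KB path and the example
# predictions below are hypothetical placeholders, and the expected JSON layout
# (QID -> {'aut_name': {...}, 'aut_works': {...}}) is the one read by
# _populate_aut2id above.
if __name__ == '__main__':
    kb = KnowledgeBase('data/kb.json')  # hypothetical path
    example_preds = [('Virgilio', 'PER'), ('Eneide', 'WORK_OF_ART')]
    authors, works = kb.link_entities(example_preds, deepfuzz=True)
    print(authors, works)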