# eventExtractionHDN/entity_linker/knowledge_base.py
#
# 88 lines
# 3.2 KiB
# Python

# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
from difflib import SequenceMatcher
import json
class KnowledgeBase:
    """Author knowledge base used to link predicted person mentions to authors.

    ``id2aut`` holds the parsed JSON file: a mapping from entry id to either
    ``None`` or a record exposing ``"aut_name"`` (a mapping whose values are
    name variants; the ``"it"`` variant is the one printed) and ``"birth"``.
    ``aut2id`` is the reverse index from every name variant to its entry id.
    """

    def __init__(self, kb_path):
        """Load the JSON knowledge base at *kb_path* and build the name index."""
        with open(kb_path, 'rb') as infile:
            self.id2aut = json.load(infile)
        self.aut2id = {}
        self._popolate_aut2id()

    def link_entities(self, preds, deepfuzz=False, threshold=0.8):
        """Print the best knowledge-base match for every predicted person.

        Args:
            preds: sequence of items indexable as ``(text, label)``. Only
                items labelled ``'PER'`` whose text is not ``'Dante'`` are
                linked; every other label (``'WORK_OF_ART'`` included) is
                currently ignored.
            deepfuzz: when true, a mention with no full-name match is also
                compared against each space-separated token of every name.
            threshold: similarity a candidate must strictly exceed to be
                reported.

        Returns:
            Always ``0``; the results are only printed.
        """
        per_mentions = [
            pred[0] for pred in preds
            if pred[1] == 'PER' and pred[0] != 'Dante'
        ]
        print('-' * 50)
        print(f'Candidate authors (i.e., entitites matched): {per_mentions}')
        print('-' * 50 + '\nChecking in KB...')
        # TODO: in the author dict I should insert also the single name
        # (e.g., Tommaso --> Tommaso d'Aquino)
        for target in set(per_mentions):
            full_scores = [
                (name, self._similar(target, name)) for name in self.aut2id
            ]
            if self._report_best(target, full_scores, 'F', threshold):
                continue
            if deepfuzz:
                # Second chance: score each name by its best-matching token.
                # str.split(' ') never returns an empty list, so max() is safe.
                token_scores = [
                    (name, max(self._similar(target, token)
                               for token in name.split(' ')))
                    for name in self.aut2id
                ]
                self._report_best(target, token_scores, 'S', threshold)
        return 0

    def _report_best(self, target, scored, tag, threshold):
        """Print the top candidate of *scored* if it beats *threshold*.

        *scored* is a list of ``(name, similarity)`` pairs. Returns ``True``
        when a prediction line was printed, ``False`` otherwise (including
        when *scored* is empty).
        """
        if not scored:
            return False
        # Only the single best candidate needs checking: if it does not clear
        # the threshold, no lower-scoring one can. max() keeps the first of
        # several equally good candidates, like a stable descending sort.
        name, score = max(scored, key=lambda pair: pair[1])
        if score <= threshold:
            return False
        entry = self.id2aut[self.aut2id[name]]
        match = (entry["aut_name"]["it"], score)
        print(f'Prediction ({tag}): {target} - {match} - born in {entry["birth"]}')
        return True

    def _generate_utter_2_ent(self):
        """Return a mapping from every name listed in ``self.kb`` to its key.

        NOTE(review): this reads ``self.kb``, which nothing in this class
        assigns; unless a caller or subclass sets it, calling this method
        raises ``AttributeError``. Confirm whether it is still needed.
        """
        return {
            utt: ent_en
            for ent_en in self.kb.keys()
            for utt in self.kb[ent_en]['names']
        }

    def _check_other_lang(self, target, original_name):
        """Rank the names of entry ``target[0]`` by similarity to *original_name*.

        Returns a list of ``(name, similarity)`` pairs, most similar first.

        NOTE(review): this reads ``self.kb``, which nothing in this class
        assigns; unless a caller or subclass sets it, calling this method
        raises ``AttributeError``. Confirm whether it is still needed.
        """
        ranked = [
            (name, self._similar(original_name, name))
            for name in self.kb[target[0]]['names']
        ]
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked

    def _similar(self, a, b):
        """Return the ``difflib.SequenceMatcher`` ratio of *a* and *b* (0.0 to 1.0)."""
        return SequenceMatcher(None, a, b).ratio()

    def _popolate_aut2id(self):
        """Fill ``aut2id`` with every name variant of every non-``None`` entry.

        If two entries share a name, the one iterated last wins. Returns
        ``self``.
        """
        for qid, values in self.id2aut.items():
            if values is None:
                continue
            for name in values['aut_name'].values():
                self.aut2id[name] = qid
        return self