eventExtractionHDN/entity_linker/knowledge_base.py

100 lines
3.2 KiB
Python

"""
Should also evaluate IF and HOW the actual spaCy KB could be deployed in this scenario
https://github.com/seatgeek/fuzzywuzzy?source=post_page---------------------------
"""
# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
from difflib import SequenceMatcher
from pprint import pprint
import pickle
class Knowledge_base:
    """Pickle-backed author knowledge base used to link predicted mentions.

    ``self.kb`` holds whatever object the pickle file contains. The methods
    below iterate it to obtain author names and read ``self.kb[name]["birth"]``
    and ``self.kb[name]["names"]``, so it is presumably a dict of per-author
    dicts -- TODO confirm against the code that builds the pickle.
    """

    def __init__(self, kb_path):
        """Load the knowledge base stored at ``kb_path``.

        Args:
            kb_path: Path of the pickle file to read.
        """
        # SECURITY: unpickling can execute arbitrary code, so only load KB
        # files that come from a trusted source.
        with open(kb_path, 'rb') as infile:
            self.kb = pickle.load(infile)
        # TODO: optionally pre-compute the exact-match lookup table with
        # self._generate_utter_2_ent() and store it as self.utt2ent.

    def link_entities(self, preds, threshold=.8):
        """Print the best KB match for every distinct ``PER`` prediction.

        Each person mention is compared with every KB key through
        ``_similar``. The most similar key is printed, together with its
        ``"birth"`` value, when its score is strictly greater than
        ``threshold``. Mentions without a good enough match print nothing.

        Args:
            preds: Iterable of indexable items where item ``[0]`` is the
                mention text and item ``[1]`` is its label; only items
                labelled ``'PER'`` are used.
            threshold: Minimum similarity (exclusive) for a match to be
                reported. Defaults to ``0.8``.

        Returns:
            Always ``0``; the results are only printed.
        """
        per_mentions = [pred[0] for pred in preds if pred[1] == 'PER']
        print(f'Candidate authors (i.e., entitites matched): {per_mentions}')
        print('#'*50 + '\nChecking in KB...')
        # TODO: in the author dict I should insert also the single name
        # (e.g., Tommaso --> Tommaso d'aquino)
        # NOTE: an earlier experiment re-scored the best match against its
        # alternative names via _check_other_lang with a ">= 0.8" cut-off;
        # with that threshold 'Tommaso' was not linked to 'Tommaso d'aquino'.

        # dict.fromkeys de-duplicates like set() but keeps first-seen order,
        # so the output order is stable between runs.
        for target in dict.fromkeys(per_mentions):
            scores = [(auth, self._similar(target, auth)) for auth in self.kb]
            if not scores:
                # Empty KB: nothing to compare against.
                continue
            # Only the top score can pass the threshold; max() returns the
            # first maximal entry, same as a stable descending sort would.
            best = max(scores, key=lambda tup: tup[1])
            if best[1] > threshold:
                birth = self.kb[best[0]]["birth"]
                print(f'Prediction: {target} - {best} - born in {birth}')
        return 0

    def _generate_utter_2_ent(self):
        """Build a mapping from every listed name to the KB key that owns it.

        Returns:
            A dict whose keys are the entries of each ``self.kb[key]['names']``
            and whose values are the corresponding KB key. When the same name
            appears under several keys, the last key iterated wins.
        """
        return {
            utt: ent_en
            for ent_en in self.kb
            for utt in self.kb[ent_en]['names']
        }

    def _check_other_lang(self, target, original_name):
        """Score ``original_name`` against all names of one KB entry.

        Args:
            target: Indexable whose item ``[0]`` is a KB key (e.g. an
                ``(author, score)`` tuple as built in ``link_entities``).
            original_name: The mention text to compare.

        Returns:
            A list of ``(name, similarity)`` tuples for every entry in
            ``self.kb[target[0]]['names']``, sorted by similarity, highest
            first.
        """
        candidates = self.kb[target[0]]['names']
        scored = [(name, self._similar(original_name, name)) for name in candidates]
        return sorted(scored, key=lambda tup: tup[1], reverse=True)

    def _similar(self, a, b):
        """Return the ``difflib.SequenceMatcher`` ratio between ``a`` and ``b``."""
        return SequenceMatcher(None, a, b).ratio()