100 lines
3.2 KiB
Python
100 lines
3.2 KiB
Python
"""
|
|
Should also evaluate IF and HOW an actual spaCy KB could be deployed in this scenario
|
|
|
|
https://github.com/seatgeek/fuzzywuzzy?source=post_page---------------------------
|
|
"""
|
|
# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
|
|
|
|
from difflib import SequenceMatcher
|
|
from pprint import pprint
|
|
import pickle
|
|
|
|
|
|
class Knowledge_base:
    """Pickle-backed author knowledge base with fuzzy name matching.

    ``self.kb`` is used as a mapping whose keys are compared against predicted
    person names and whose values are indexed with ``'birth'`` and ``'names'``.
    """

    # A candidate is accepted only when its similarity ratio is strictly
    # greater than this value.
    _MATCH_THRESHOLD = 0.8

    # Was an unused local inside ``link_entities``. Presumably the date of the
    # Commedia, meant to filter out authors born later -- TODO confirm and use.
    COMMEDIA_DATE = 1321

    def __init__(self, kb_path):
        """Load the knowledge base from the pickle file at ``kb_path``.

        Args:
            kb_path: Path of the pickled knowledge base.

        Raises:
            OSError: If the file cannot be opened.
            pickle.UnpicklingError: If the file is not a valid pickle.
        """
        # NOTE(security): pickle.load can execute arbitrary code while
        # deserialising; only point this at trusted, locally produced files.
        with open(kb_path, 'rb') as infile:
            self.kb = pickle.load(infile)
        # TODO: optionally build the utterance -> entity index here with
        # self._generate_utter_2_ent() to allow exact alias lookups.

    def link_entities(self, preds):
        """Print the best KB match for every distinct ``PER`` prediction.

        Args:
            preds: Iterable of indexable pairs where ``pred[0]`` is the
                surface text and ``pred[1]`` is the entity label. Only the
                ``'PER'`` label is used; other labels such as
                ``'WORK_OF_ART'`` are currently ignored.

        Returns:
            Always ``0``; results are reported on standard output.
        """
        per_preds = [pred[0] for pred in preds if pred[1] == 'PER']
        print(f'Candidate authors (i.e., entitites matched): {per_preds}')
        print('#'*50 + '\nChecking in KB...')

        # TODO: in the author dict I should insert also the single name
        # (e.g., Tommaso --> Tommaso d'aquino)
        # TODO: an earlier experiment re-scored the best candidate against its
        # alternative names with self._check_other_lang(); with a 0.8
        # threshold 'Tommaso' is still not linked to 'Tommaso d'aquino'.

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output order is stable across runs (iterating a set is not).
        for target in dict.fromkeys(per_preds):
            # Only the highest-scoring key can pass the threshold, so ranking
            # the whole KB is unnecessary. ``default=None`` covers an empty KB,
            # which previously raised IndexError (as did any KB with fewer
            # than three entries and no match).
            best = max(
                ((auth, self._similar(target, auth)) for auth in self.kb),
                key=lambda tup: tup[1],
                default=None,
            )
            if best is not None and best[1] > self._MATCH_THRESHOLD:
                print(f'Prediction: {target} - {best} - born in {self.kb[best[0]]["birth"]}')

        return 0

    def _generate_utter_2_ent(self):
        """Return a dict mapping each entry of every ``'names'`` list to its KB key.

        If the same name is listed under several keys, the key iterated last
        wins.
        """
        return {
            utt: ent_en
            for ent_en, record in self.kb.items()
            for utt in record['names']
        }

    def _check_other_lang(self, target, original_name):
        """Score ``original_name`` against every alternative name of a KB entry.

        Args:
            target: Indexable whose first item is a KB key (for example an
                ``(author, score)`` pair); only ``target[0]`` is read.
            original_name: The string to compare with each alternative name.

        Returns:
            List of ``(name, similarity)`` tuples sorted by similarity,
            highest first.
        """
        other_names = self.kb[target[0]]['names']
        scores = [(name, self._similar(original_name, name)) for name in other_names]
        scores.sort(key=lambda tup: tup[1], reverse=True)
        return scores

    def _similar(self, a, b):
        """Return the ``difflib.SequenceMatcher`` ratio (0.0 to 1.0) of ``a`` and ``b``."""
        return SequenceMatcher(None, a, b).ratio()
|