# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
from difflib import SequenceMatcher
import json


class KnowledgeBase:
    """Author knowledge base loaded from a JSON file, with fuzzy name lookup.

    Attributes:
        id2aut: The decoded JSON document. ``_popolate_aut2id`` and
            ``link_entities`` read it as a mapping from an identifier to
            either ``None`` or a record holding an ``"aut_name"`` mapping
            (with at least an ``"it"`` entry) and a ``"birth"`` value.
        aut2id: Reverse index from every author name found in
            ``id2aut[...]["aut_name"]`` to its identifier.
    """

    # A candidate is accepted only when its similarity ratio is strictly
    # greater than this value.
    _MATCH_THRESHOLD = .8
    # Number of best-ranked candidates inspected for each entity.
    _TOP_CANDIDATES = 3

    def __init__(self, kb_path):
        """Load the knowledge base stored at ``kb_path`` and index its names.

        Args:
            kb_path: Path of the JSON file to load.
        """
        with open(kb_path, 'rb') as infile:
            data = json.load(infile)
        self.id2aut = data
        self.aut2id = {}
        self._popolate_aut2id()

    def link_entities(self, preds, deepfuzz=False):
        """Print the best knowledge-base match for each predicted person.

        Args:
            preds: Iterable of indexable items where ``pred[0]`` is the
                entity text and ``pred[1]`` its label. Only items labelled
                ``'PER'`` whose text is not ``'Dante'`` are looked up.
            deepfuzz: When true, an entity with no full-name match is also
                compared against every space-separated token of each known
                author name.

        Returns:
            Always ``0``; the matches are only printed.
        """
        per_preds = [
            pred[0] for pred in preds
            if pred[1] == 'PER' and pred[0] != 'Dante'
        ]
        print('-'*50)
        print(f'Candidate authors (i.e., entitites matched): {per_preds}')
        # TODO: a COMMEDIA_DATE = 1321 constant was declared here but never
        # used; presumably candidates were meant to be filtered by birth
        # year - confirm before implementing.
        print('-'*50 + '\nChecking in KB...')
        # TODO: in the author dict I should insert also the single name
        # (e.g., Tommaso --> Tommaso d'Aquino)
        for target in set(per_preds):
            # First pass: compare against each complete author name.
            scores = [
                (auth, self._similar(target, auth)) for auth in self.aut2id
            ]
            if self._report_best(target, scores, 'F'):
                continue
            if deepfuzz:
                # Second pass: score each author by its best-matching token.
                deepscore = [
                    (
                        aut,
                        max(self._similar(target, part)
                            for part in aut.split(' ')),
                    )
                    for aut in self.aut2id
                ]
                self._report_best(target, deepscore, 'S')
        return 0

    def _report_best(self, target, scored, label):
        """Print the first top-ranked candidate above the threshold.

        Args:
            target: Entity text being linked.
            scored: List of ``(author_name, similarity)`` pairs.
            label: Tag placed in the printed line (``'F'`` for the full-name
                pass, ``'S'`` for the split-name pass).

        Returns:
            ``True`` if a prediction was printed, ``False`` otherwise.
        """
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
        # Slicing instead of indexing keeps this safe when the knowledge
        # base holds fewer names than _TOP_CANDIDATES (or none at all).
        for name, score in ranked[:self._TOP_CANDIDATES]:
            if score > self._MATCH_THRESHOLD:
                author = self.id2aut[self.aut2id[name]]
                print(
                    f'Prediction ({label}): {target} - '
                    f'{(author["aut_name"]["it"], score)} - '
                    f'born in {author["birth"]}')
                return True
        return False

    def _generate_utter_2_ent(self):
        """Map every name listed in ``self.kb`` to its entity key.

        NOTE(review): ``self.kb`` is not assigned in ``__init__``; unless it
        is set elsewhere this method raises ``AttributeError``.

        Returns:
            Dict from each entry of ``self.kb[key]['names']`` to ``key``.
        """
        utt_2_ent = {}
        for ent_en in self.kb.keys():
            for utt in self.kb[ent_en]['names']:
                utt_2_ent[utt] = ent_en
        return utt_2_ent

    def _check_other_lang(self, target, original_name):
        """Rank the names of entity ``target[0]`` by similarity to a name.

        NOTE(review): reads ``self.kb``, which ``__init__`` does not assign.

        Args:
            target: Indexable whose first element is a key of ``self.kb``.
            original_name: Name compared against each stored name.

        Returns:
            List of ``(name, similarity)`` pairs, most similar first.
        """
        other_names = self.kb[target[0]]['names']
        scores = []
        for name in other_names:
            sim = self._similar(original_name, name)
            scores.append((name, sim))
        scores.sort(key=lambda tup: tup[1], reverse=True)
        return scores

    def _similar(self, a, b):
        """Return the ``SequenceMatcher`` similarity ratio of ``a`` and ``b``."""
        return SequenceMatcher(None, a, b).ratio()

    def _popolate_aut2id(self):
        """Fill ``aut2id`` with every author name found in ``id2aut``.

        Entries whose value is ``None`` are skipped. When the same name
        appears under several identifiers, the last one iterated wins.

        Returns:
            This instance.
        """
        for qid, values in self.id2aut.items():
            if values is not None:
                # A set avoids re-inserting a name shared by several
                # languages of the same record.
                l_names = set(values['aut_name'].values())
                for name in l_names:
                    self.aut2id[name] = qid
        return self