# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
from difflib import SequenceMatcher
import json

import numpy as np


class KnowledgeBase:
    """A small fuzzy-matching entity linker over a JSON knowledge base.

    The KB file maps author QIDs to records of the form::

        {"aut_name": {"it": ..., "en": ..., ...},
         "aut_works": {wid: {"it": ...}, ...},
         "aut_present_work": {...},
         "birth": ...}

    Two lookup tables are derived on load: ``aut2id`` (any-language author
    name -> qid) and ``works2aut`` (Italian work title -> Italian author name).
    """

    def __init__(self, kb_path, extension=None):
        """Load the KB from *kb_path* and optionally extend it.

        Parameters
        ----------
        kb_path : str or Path
            Path to the JSON knowledge-base file.
        extension : pandas.DataFrame, optional
            Rows with ``quot_author`` / ``quot_title`` columns used to add
            authors and works missing from the base KB.
        """
        # JSON is text; open in text mode with an explicit encoding.
        with open(kb_path, 'r', encoding='utf-8') as infile:
            self.id2aut = json.load(infile)
        self.aut2id = {}
        self.works2aut = {}
        self._popolate_aut2id()
        if extension is not None:
            self._extend_kb(extension)
            # Rebuild the lookup tables so extension entries become searchable.
            self._popolate_aut2id()

    def link_entities(self, preds, deepfuzz=False):
        """Link NER predictions to KB authors and works via fuzzy matching.

        Parameters
        ----------
        preds : list[tuple[str, str]]
            ``(surface_form, label)`` pairs; only labels ``'PER'`` and
            ``'WORK_OF_ART'`` are considered, and the literal form
            ``'Dante'`` is skipped.
        deepfuzz : bool
            When True, candidates that fail the full-name match are retried
            against the individual tokens of every author name, with a
            stricter 0.9 threshold.

        Returns
        -------
        tuple[list, dict]
            Matched author surface forms, and a mapping
            ``{work surface form: Italian author name}``.
        """
        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
        WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']

        print('-' * 50 + '\n\nOUTPUT:\n### Author matches:')
        aut_res = []
        for target in set(PER_preds):
            if self._match_author_full(target, aut_res):
                continue
            if deepfuzz:
                self._match_author_tokens(target, aut_res)

        work_res = {}
        if len(WORK_preds) != 0:
            print('-' * 50 + '\n### Works matches:')
            for target in set(WORK_preds):
                self._match_work(target, work_res)
        return aut_res, work_res

    def _match_author_full(self, target, aut_res):
        """Match *target* against full author names (0.8 threshold, top 3).

        Appends *target* to *aut_res* and returns True on the first hit.
        """
        scores = [(auth, self._similar(target, auth)) for auth in self.aut2id]
        scores.sort(key=lambda tup: tup[1], reverse=True)
        # Slice guards against KBs with fewer than three authors
        # (the original indexed positions 0..2 unconditionally -> IndexError).
        for auth, sim in scores[:3]:
            if sim > .8:
                print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[auth]]["aut_name"]["it"]}')
                aut_res.append(target)
                return True
        return False

    def _match_author_tokens(self, target, aut_res):
        """Second-chance match of *target* against single author-name tokens.

        Stricter 0.9 threshold, since single tokens match more easily.
        """
        deepscore = []
        for aut in self.aut2id:
            # Best similarity over the space-separated tokens of the full name.
            sim = max(self._similar(target, tok) for tok in aut.split(' '))
            deepscore.append((aut, sim))
        deepscore.sort(key=lambda tup: tup[1], reverse=True)
        for aut, sim in deepscore[:3]:
            if sim > .9:
                print(f'Prediction (S): {target} - {self.id2aut[self.aut2id[aut]]["aut_name"]["it"]}')
                aut_res.append(target)
                return True
        return False

    def _match_work(self, target, work_res):
        """Match *target* against work titles (0.75 threshold, top 3)."""
        scores_work = [(work, self._similar(target, work)) for work in self.works2aut]
        scores_work.sort(key=lambda tup: tup[1], reverse=True)
        for work, sim in scores_work[:3]:
            if sim > .75:
                print(f'Prediction (F): {target} by: {self.works2aut[work]}')
                work_res[target] = self.works2aut[work]
                return True
        return False

    def _similar(self, a, b):
        """Return the difflib similarity ratio between *a* and *b* (0..1)."""
        return SequenceMatcher(None, a, b).ratio()

    def _popolate_aut2id(self):
        """(Re)build ``aut2id`` and ``works2aut`` from ``id2aut``.

        Name kept for backward compatibility (sic: "populate").
        Returns ``self`` for chaining, as the original did.
        """
        for qid, values in self.id2aut.items():
            # A literal "null" key (or null record) marks an unusable entry.
            if qid == 'null' or values is None:
                continue
            # Every language variant of the name maps to the same qid.
            for name in set(values['aut_name'].values()):
                self.aut2id[name] = qid
            for wvalues in values['aut_works'].values():
                try:
                    self.works2aut[wvalues['it']] = values['aut_name']['it']
                except KeyError:
                    # Work or author lacks an Italian title/name: skip it.
                    # (Was a bare except that hid every error.)
                    continue
        return self

    def _extend_kb(self, df):
        """Add authors/works from a DataFrame with ``quot_author``/``quot_title``.

        Rows with a missing author or title, and consecutive rows repeating
        the same title, are skipped. Unknown authors get synthetic Q/W ids.
        """
        _qid = 0
        prev_work = ''
        for i in range(len(df)):
            row = df.iloc[i]
            auth = row.quot_author
            work = row.quot_title
            # Skip incomplete rows and consecutive duplicates of the same title.
            if auth is np.nan or work is np.nan or work == prev_work:
                continue
            try:
                qid = self.aut2id[auth]
            except KeyError:
                # Unknown author: create a synthetic record and index it
                # immediately, so further works by the same author attach to
                # it (the original minted a new QID per work, duplicating the
                # author once per title).
                qid = f'Q{_qid}'
                self.id2aut[qid] = {'aut_name': {'it': auth},
                                    'aut_works': {},
                                    'aut_present_work': {},
                                    'birth': 0}
                self.aut2id[auth] = qid
            new_wid = f'W{_qid}'
            _qid += 1
            self.id2aut[qid]['aut_works'][new_wid] = {'it': work}
            prev_work = work