# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
from difflib import SequenceMatcher
import json

import numpy as np


class KnowledgeBase:
    """Links NER predictions (authors and works of art) to entries of a JSON knowledge base
    via edit-distance matching on the names and titles stored in the KB."""

    def __init__(self, kb_path, extension=None):
        with open(kb_path, 'rb') as infile:
            data = json.load(infile)
        self.id2aut = data
        self.aut2id = {}
        self.works2aut = {}
        self._popolate_aut2id()
        if extension is not None:
            self._extend_kb(extension)
            self._popolate_aut2id()

    def link_entities(self, preds, deepfuzz=False):
        COMMEDIA_DATE = 1321  # currently unused
        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
        WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
        print('\nMODEL RAW PREDICTIONS:')
        print(f'Candidate authors (i.e., entities matched): {PER_preds}')
        print(f'Candidate works: {WORK_preds}')
        print('-' * 50 + '\n\nFINAL OUTPUT:\n### Author matches:')
        """
        Sort PER_preds (i.e., entities of type PERSON matched by the NER model in the given
        commentary) by the edit distance computed against all of the author names in the KB.
        First, the similarity is computed between the match and the exact name stored in the KB.
        Then, if deepfuzz is True, the same metric is computed between the original NER match and
        every author name in the KB, additionally split on ' ' (i.e., space) in order to handle
        name abbreviations (e.g., 'Tommaso d'Aquino' is often referred to simply as 'Tommaso'
        in the commentaries). Once sorted, the first candidate satisfying the given threshold
        is returned.
        """
        aut_res = []
        for target in set(PER_preds):
            scores = []
            deepscore = []
            for auth in self.aut2id.keys():
                sim = self._similar(target, auth)
                scores.append((auth, sim))
            scores.sort(key=lambda tup: tup[1], reverse=True)
            success = False
            # Exact-name pass: check the three most similar KB names against the threshold.
            for auth, sim in scores[:3]:
                if sim > .8:
                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[auth]]["aut_name"]["it"]}')
                    success = True
                    aut_res.append(target)
                    break
            if deepfuzz and not success:
                # Split-name pass: score the target against every space-separated token of each
                # KB name and keep the best token score per author.
                for aut in self.aut2id.keys():
                    _splitname = aut.split(' ')
                    sim = 0
                    for split in _splitname:
                        _sim = self._similar(target, split)
                        if _sim > sim:
                            sim = _sim
                    deepscore.append((aut, sim))
                deepscore.sort(key=lambda tup: tup[1], reverse=True)
                for aut, sim in deepscore[:3]:
                    if sim > .8:
                        print(f'Prediction (S): {target} - {self.id2aut[self.aut2id[aut]]["aut_name"]["it"]}')
                        aut_res.append(target)
                        break
        """
        Sort WORK_preds (i.e., entities of type WORK_OF_ART matched by the NER model in the given
        commentary) by the edit distance computed against all of the titles in the KB.
        The similarity is computed between the match and the exact title stored in the KB.
        Once sorted, the first candidate satisfying the given threshold is returned.
""" work_res = {} if len(WORK_preds) != 0: print('-' * 50 + '\n### Works matches:') for target in set(WORK_preds): scores_work = [] for work in self.works2aut.keys(): sim = self._similar(target, work) scores_work.append((work, sim)) scores_work.sort(key=lambda tup: tup[1], reverse=True) for i in range(3): if scores_work[i][1] > .7: print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}') work_res[target] = self.works2aut[scores_work[i][0]] break return aut_res, work_res def _similar(self,a, b): return SequenceMatcher(None, a, b).ratio() def _popolate_aut2id(self): for qid, values in self.id2aut.items(): if qid == 'null': continue if values is not None: l_names = set(values['aut_name'].values()) for name in l_names: self.aut2id[name] = qid works = values['aut_works'] if len(works) != 0: for wid, wvalues in works.items(): try: self.works2aut[wvalues['it']] = self.id2aut[qid]['aut_name']['it'] except: continue return self def _extend_kb(self, df): _qid = 0 prev_work = '' for i in range(len(df)): row = df.iloc[i] auth = row.quot_author work = row.quot_title if auth is not np.nan and work is not np.nan: if work != prev_work: try: qid = self.aut2id[auth] new_wid = f'W{_qid}' _qid += 1 self.id2aut[qid]['aut_works'][new_wid] = {'it': work} prev_work = work except: new_qid = f'Q{str(_qid)}' new_wid = f'W{str(_qid)}' _qid += 1 self.id2aut[new_qid] = {'aut_name': {'it': auth}, 'aut_works': {new_wid: {'it': work}}, 'aut_present_work': {}, 'birth': 0} prev_work = work else: continue