# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz

import json
from difflib import SequenceMatcher

import pandas as pd


class KnowledgeBase:
    """Author knowledge base: maps QIDs to author names and works, and links
    NER predictions (authors and works of art) to its entries."""

    def __init__(self, kb_path, extension=None):
        with open(kb_path, 'r', encoding='utf-8') as infile:
            data = json.load(infile)

        self.id2aut = data
        self.aut2id = {}
        self.works2aut = {}
        self._populate_aut2id()
        if extension is not None:
            self._extend_kb(extension)
            # Re-index so the extension's new names and works are searchable too.
            self._populate_aut2id()

    def link_entities(self, preds, deepfuzz=False):
        COMMEDIA_DATE = 1321  # date of the Commedia; currently unused here
        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
        WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
        print('\nMODEL RAW PREDICTIONS:')
        print(f'Candidate authors (i.e., entities matched): {PER_preds}')
        print(f'Candidate works: {WORK_preds}')
        print('-' * 50 + '\n\nFINAL OUTPUT:\n### Author matches:')
"""
|
|
Sorting PER_preds (i.e., entities of type PERSON matched by the NER model in the given commentary) according
|
|
to the edit distance computed across all of the entities' name in the KB. At first, the similarity measure
|
|
is computed between the match and the exact name stored in the KB. Eventually, if deepscore == True, such a
|
|
metric is computed beteen the original NER match and every author name present in the KB but also split on
|
|
the ' ' (i.e., space) in order to deal with name abbreviations (i.e., 'Tommaso d'Aquino' is often referred to as
|
|
simply 'Tommaso' in the commentaries). Once sorted, the first element satisfying the given threshold is returned.
|
|
"""
|
|
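        # For instance, SequenceMatcher(None, 'Tommaso', "Tommaso d'Aquino").ratio()
        # is only about 0.61 (below the 0.8 threshold), whereas the split pass
        # compares 'Tommaso' with the token 'Tommaso' and scores 1.0.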
        aut_res = []
        for target in set(PER_preds):
            # First pass: score the NER match against every full author name.
            scores = [(auth, self._similar(target, auth)) for auth in self.aut2id]
            scores.sort(key=lambda tup: tup[1], reverse=True)
            success = False
            # Inspect the top three candidates (the slice is safe even when the
            # KB holds fewer than three names).
            for auth, score in scores[:3]:
                if score > .8:
                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[auth]]["aut_name"]["it"]}')
                    success = True
                    aut_res.append(target)
                    break
            if deepfuzz and not success:
                # Second pass: keep each author's best score over the space-
                # separated tokens of their name, so abbreviations still match.
                deepscore = []
                for aut in self.aut2id:
                    sim = max(self._similar(target, split) for split in aut.split(' '))
                    deepscore.append((aut, sim))
                deepscore.sort(key=lambda tup: tup[1], reverse=True)
                for aut, score in deepscore[:3]:
                    if score > .8:
                        print(f'Prediction (S): {target} - {self.id2aut[self.aut2id[aut]]["aut_name"]["it"]}')
                        aut_res.append(target)
                        break
"""
|
|
Sorting WORK_preds (i.e., entities of type WORK_OF_ART matched by the NER model in the given commentary)
|
|
according to the edit distance computed across all of the titles in the KB. The similarity measure
|
|
is computed between the match and the exact name stored in the KB.
|
|
Once sorted, the first element satisfying the given threshold is returned.
|
|
"""
|
|
        work_res = {}
        if len(WORK_preds) != 0:
            print('-' * 50 + '\n### Works matches:')
            for target in set(WORK_preds):
                scores_work = [(work, self._similar(target, work)) for work in self.works2aut]
                scores_work.sort(key=lambda tup: tup[1], reverse=True)
                for work, score in scores_work[:3]:
                    if score > .7:
                        print(f'Prediction (F): {target} by: {self.works2aut[work]}')
                        work_res[target] = self.works2aut[work]
                        break

        return aut_res, work_res

    def _similar(self, a, b):
        """Return the SequenceMatcher similarity ratio (0.0-1.0) of two strings."""
        return SequenceMatcher(None, a, b).ratio()
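
    # A minimal sketch toward the TODO at the top of this file, assuming the
    # third-party rapidfuzz package (spaczz offers spaCy-integrated matchers in
    # the same spirit). `_similar_fuzzy` is a hypothetical helper added here for
    # illustration only; link_entities does not call it.
    def _similar_fuzzy(self, a, b):
        from rapidfuzz import fuzz  # local import keeps the dependency optional
        # token_sort_ratio ignores token order, so "Aquino, Tommaso d'" and
        # "Tommaso d'Aquino" score highly; rescale from 0-100 to 0.0-1.0.
        return fuzz.token_sort_ratio(a, b) / 100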

    def _populate_aut2id(self):
        """Build the name -> QID and work-title -> author-name lookup tables."""
        for qid, values in self.id2aut.items():
            if qid == 'null' or values is None:
                continue
            for name in set(values['aut_name'].values()):
                self.aut2id[name] = qid
            for wid, wvalues in values['aut_works'].items():
                try:
                    self.works2aut[wvalues['it']] = values['aut_name']['it']
                except KeyError:  # no Italian label for this work or author
                    continue
        return self

    def _extend_kb(self, df):
        """Merge authors/works from a DataFrame with quot_author and quot_title columns."""
        _qid = 0
        prev_work = ''
        for i in range(len(df)):
            row = df.iloc[i]
            auth = row.quot_author
            work = row.quot_title
            # Skip rows with a missing author or title; pd.notna is robust to any
            # NaN object, not just the np.nan singleton an identity test catches.
            if pd.notna(auth) and pd.notna(work) and work != prev_work:
                try:
                    # Known author: attach the work under a fresh synthetic id.
                    qid = self.aut2id[auth]
                    new_wid = f'W{_qid}'
                    _qid += 1
                    self.id2aut[qid]['aut_works'][new_wid] = {'it': work}
                except KeyError:
                    # Unknown author: create a synthetic entry for author + work.
                    new_qid = f'Q{_qid}'
                    new_wid = f'W{_qid}'
                    _qid += 1
                    self.id2aut[new_qid] = {'aut_name': {'it': auth},
                                            'aut_works': {new_wid: {'it': work}},
                                            'aut_present_work': {},
                                            'birth': 0}
                prev_work = work
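

# A minimal usage sketch, assuming an illustrative 'kb.json' path and hand-made
# (text, label) predictions like those an NER pass would emit; neither is an
# artifact of this repo.
if __name__ == '__main__':
    kb = KnowledgeBase('kb.json')
    preds = [('Tommaso', 'PER'), ('Etica', 'WORK_OF_ART'), ('Dante', 'PER')]
    authors, works = kb.link_entities(preds, deepfuzz=True)
    print(authors, works)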