# eventExtractionHDN/entity_linker/knowledge_base.py

# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
from difflib import SequenceMatcher
import json


class KnowledgeBase:
    """Author knowledge base with fuzzy name lookup.

    The JSON document at ``kb_path`` is loaded into ``id2aut`` (entry id ->
    entry). ``aut2id`` is the reverse index built from it: every value found
    in an entry's ``'aut_name'`` mapping points back to that entry's id.
    """

    # A candidate is reported only when its similarity ratio is strictly
    # greater than this value.
    MATCH_THRESHOLD = .8

    def __init__(self, kb_path):
        """Load the JSON knowledge base at ``kb_path`` and index its names."""
        with open(kb_path, 'rb') as infile:
            data = json.load(infile)

        self.id2aut = data
        self.aut2id = {}
        self._popolate_aut2id()

    def link_entities(self, preds, deepfuzz=False):
        """Print the best knowledge-base match for every predicted person.

        Args:
            preds: sequence of items whose element 0 is the entity text and
                element 1 its label. Only ``'PER'`` items are linked, and the
                literal text ``'Dante'`` is skipped.
            deepfuzz: when true, a target with no full-name match is also
                compared against each space-separated token of every name.

        Returns:
            Always ``0``; matches are only reported on stdout.
        """
        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
        print('-'*50)
        print(f'Candidate authors (i.e., entitites matched): {PER_preds}')
        print('-'*50 + '\nChecking in KB...')

        # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'Aquino)

        for target in set(PER_preds):
            # (F)ull-name comparison first; the token-level (S)plit
            # comparison is only a fallback.
            if self._report_best('F', target, self._similar):
                continue
            if deepfuzz:
                self._report_best('S', target, self._token_similarity)

        return 0

    def _report_best(self, tag, target, score_fn):
        """Print the top-scoring name for ``target`` if it beats the threshold.

        Only the single best candidate is examined: candidates are ranked by
        score, so if the best one fails the threshold every other one does
        too. An empty name index simply yields no match.

        Returns:
            ``True`` when a prediction was printed, ``False`` otherwise.
        """
        # max() keeps the first of several equally scored names, like a
        # stable descending sort would.
        best = max(
            ((name, score_fn(target, name)) for name in self.aut2id),
            key=lambda pair: pair[1],
            default=None,
        )
        if best is None or not best[1] > self.MATCH_THRESHOLD:
            return False

        name, score = best
        entry = self.id2aut[self.aut2id[name]]
        print(f'Prediction ({tag}): {target} - {entry["aut_name"]["it"], score} - born in {entry["birth"]}')
        return True

    def _token_similarity(self, target, name):
        """Return the highest similarity between ``target`` and any token of ``name``.

        ``name`` is split on single spaces; the result is never below 0.
        """
        return max((self._similar(target, token) for token in name.split(' ')), default=0)

    def _generate_utter_2_ent(self):
        """Map every item of ``self.kb[key]['names']`` back to its ``key``.

        NOTE(review): ``self.kb`` is not assigned anywhere in this class, so
        this raises ``AttributeError`` unless a caller sets it first -
        confirm whether this method is still in use.
        """
        utt_2_ent = {}
        for ent_en in self.kb.keys():
            for utt in self.kb[ent_en]['names']:
                utt_2_ent[utt] = ent_en
        return utt_2_ent

    def _check_other_lang(self, target, original_name):
        """Rank the ``'names'`` of entry ``target[0]`` by similarity to ``original_name``.

        Returns:
            List of ``(name, similarity)`` tuples, highest similarity first.

        NOTE(review): reads ``self.kb``, which this class never assigns -
        see the note on ``_generate_utter_2_ent``.
        """
        other_names = self.kb[target[0]]['names']

        scores = []
        for name in other_names:
            sim = self._similar(original_name, name)
            scores.append((name, sim))
        scores.sort(key=lambda tup: tup[1], reverse=True)
        return scores

    def _similar(self, a, b):
        """Return the ``SequenceMatcher`` ratio of ``a`` and ``b`` (0.0 to 1.0)."""
        return SequenceMatcher(None, a, b).ratio()

    def _popolate_aut2id(self):
        """Fill ``aut2id`` from ``id2aut`` and return ``self``.

        Entries whose value is ``None`` are skipped. When the same name
        occurs under several ids, the id iterated last wins.
        """
        for qid, values in self.id2aut.items():
            if values is None:
                continue
            for name in set(values['aut_name'].values()):
                self.aut2id[name] = qid
        return self