# eventExtractionHDN/entity_linker/knowledge_base.py

"""
Should also evaluate IF and HOW the actual spaCy KB could be deployed in this scenario

https://github.com/seatgeek/fuzzywuzzy?source=post_page---------------------------
"""
# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz

from difflib import SequenceMatcher
from pprint import pprint
import pickle


class Knowledge_base:
    """Name-based entity linker backed by a pickled author knowledge base.

    ``self.kb`` is used as a mapping from an author name to a record; the
    methods of this class read the ``"birth"`` and ``"names"`` entries of
    those records. Names are compared with ``difflib.SequenceMatcher``
    similarity ratios (a float in ``[0, 1]``).
    """

    def __init__(self, kb_path):
        """Load the knowledge base from the pickle file at ``kb_path``.

        NOTE(security): ``pickle.load`` can execute arbitrary code while
        unpickling, so ``kb_path`` must only ever point to a trusted file.
        """
        with open(kb_path, 'rb') as infile:
            self.kb = pickle.load(infile)
        # An utterance -> entity index is available on demand through
        # ``self._generate_utter_2_ent()``; it is not built at load time.

    def link_entities(self, preds, *, threshold: float = 0.8) -> int:
        """Print the best knowledge-base match for each predicted person.

        Args:
            preds: Sequence of indexable pairs whose item 0 is the entity
                text and item 1 its label. Only pairs labelled ``'PER'``
                are linked; every other label is ignored.
            threshold: A candidate is reported only when its similarity to
                the entity text is strictly greater than this value.

        Returns:
            Always ``0``; the matches are reported on stdout.
        """
        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER']
        print(f'Candidate authors (i.e., entitites matched): {PER_preds}')
        print('#'*50 + '\nChecking in KB...')

        # TODO: in the author dict I should insert also the single name
        # (e.g., Tommaso --> Tommaso d'aquino)
        # NOTE: an alternative strategy is to re-score the best candidate
        # against all of its names with ``_check_other_lang``; with a 0.8
        # threshold 'Tommaso' is still not linked to 'Tommaso d'aquino'.

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output order is deterministic across runs.
        for target in dict.fromkeys(PER_preds):
            # Only the highest-scoring author can exceed the threshold, so a
            # single max() is enough. ``default=None`` covers an empty KB,
            # and max() keeps the first author on ties.
            best = max(
                ((auth, self._similar(target, auth)) for auth in self.kb),
                key=lambda tup: tup[1],
                default=None,
            )
            if best is not None and best[1] > threshold:
                print(f'Prediction: {target} - {best} - born in {self.kb[best[0]]["birth"]}')

        return 0

    def _generate_utter_2_ent(self) -> dict:
        """Return a mapping from every name listed under ``"names"`` to its KB key.

        If the same name is listed for several entries, the entry that comes
        last in the KB's iteration order wins.
        """
        return {
            utt: ent
            for ent, record in self.kb.items()
            for utt in record['names']
        }

    def _check_other_lang(self, target, original_name) -> list:
        """Score ``original_name`` against every name of one KB entry.

        Args:
            target: Indexable whose item 0 is a key of ``self.kb`` (for
                example an ``(author, score)`` tuple).
            original_name: The text to compare with each of the entry's
                ``"names"``.

        Returns:
            List of ``(name, similarity)`` tuples, best match first.
        """
        scores = [
            (name, self._similar(original_name, name))
            for name in self.kb[target[0]]['names']
        ]
        scores.sort(key=lambda tup: tup[1], reverse=True)
        return scores

    def _similar(self, a, b) -> float:
        """Return the ``SequenceMatcher`` similarity ratio of ``a`` and ``b``."""
        return SequenceMatcher(None, a, b).ratio()