Add a function to extend the KB with data from annotated commentaries

andrea 2020-10-23 16:59:01 +02:00
parent fb84b36b90
commit 2324ddff9f
5 changed files with 117 additions and 103 deletions
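In short: KnowledgeBase now takes an optional extension argument, a dataframe of annotated commentaries whose quot_author / quot_title columns are folded into the KB before linking, and link_entities now returns the matched authors and works. A minimal usage sketch based on the calls in main.py below; the dataframe rows and NER predictions are illustrative stand-ins, while the KB path and call signatures come from the diff:

import pandas as pd
from entity_linker.knowledge_base import KnowledgeBase

# Stand-ins for df_eval (read from the commentaries CSVs) and for the spaCy NER output.
df_eval = pd.DataFrame({'quot_author': ['Aristotele'], 'quot_title': ['Etica']})
preds = [('Aristotile', 'PER'), ('Etica', 'WORK_OF_ART')]

kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json',
                   extension=df_eval)
aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
# aut_res: list of author mentions matched in the KB; work_res: {work mention: author name}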

View File

@ -1,30 +1,33 @@
# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
from difflib import SequenceMatcher
import json
import numpy as np
class KnowledgeBase:
def __init__(self, kb_path):
def __init__(self, kb_path, extension=None):
with open(kb_path, 'rb') as infile:
data = json.load(infile)
self.id2aut = data
self.aut2id = {}
self.works2aut = {}
self._popolate_aut2id()
if extension is not None:
self._extend_kb(extension)
self._popolate_aut2id()
def link_entities(self, preds, deepfuzz=False):
PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
print('-'*50)
print(f'Candidate authors (i.e., entitites matched): {PER_preds}')
# print(f'Candidates work:\n{WORK_preds}')
# print('-'*50)
# print(f'Candidate authors (i.e., entities matched): {PER_preds}')
# print(f'Candidate works: {WORK_preds}')
COMMEDIA_DATE = 1321
print('-'*50 + '\nChecking in KB...')
# TODO: the author dict should also map bare given names to the full entry (e.g., Tommaso --> Tommaso d'Aquino)
print('-'*50 + '\n\nOUTPUT:\n### Author matches:')
aut_res = []
for target in set(PER_preds):
scores = []
deepscore = []
@ -36,8 +39,10 @@ class KnowledgeBase:
success = False
for i in range(3):
if scores[i][1] > .8:
print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"], scores[i][1]} - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}')
#, scores[i][1]}') # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
success = True
aut_res.append(target)
break
if deepfuzz and not success:
for aut in self.aut2id.keys():
@ -50,38 +55,74 @@ class KnowledgeBase:
deepscore.append((aut, sim))
deepscore.sort(key=lambda tup: tup[1], reverse=True)
for j in range(3):
if deepscore[j][1] > .8:
if deepscore[j][1] > .9:
print(
f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"], deepscore[j][1]} - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}')
#, deepscore[j][1]}') # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
aut_res.append(target)
break
return 0
def _generate_utter_2_ent(self):
utt_2_ent = {}
for ent_en in self.kb.keys():
for utt in self.kb[ent_en]['names']:
utt_2_ent[utt] = ent_en
return utt_2_ent
work_res = {}
if len(WORK_preds) != 0:
print('-' * 50 + '\n### Works matches:')
for target in set(WORK_preds):
scores_work = []
for work in self.works2aut.keys():
sim = self._similar(target, work)
scores_work.append((work, sim))
scores_work.sort(key=lambda tup: tup[1], reverse=True)
for i in range(3):
if scores_work[i][1] > .75:
print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}')
work_res[target] = self.works2aut[scores_work[i][0]]
break
def _check_other_lang(self, target, original_name):
other_names = self.kb[target[0]]['names']
scores = []
for name in other_names:
sim = self._similar(original_name, name)
scores.append((name, sim))
scores.sort(key=lambda tup: tup[1], reverse=True)
return scores
return aut_res, work_res
def _similar(self,a, b):
return SequenceMatcher(None, a, b).ratio()
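# Intuition for the fuzzy-match cutoffs used above: ratio() is 2*M/T, where M is the
# number of matching characters and T the combined length of the two strings; e.g. a
# pair such as 'Aristotile' / 'Aristotele' scores 2*9/20 = 0.9 (illustrative example).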
def _popolate_aut2id(self):
for qid, values in self.id2aut.items():
if qid == 'null':
continue
if values is not None:
l_names = set(values['aut_name'].values())
for name in l_names:
self.aut2id[name] = qid
works = values['aut_works']
if len(works) != 0:
for wid, wvalues in works.items():
try:
self.works2aut[wvalues['it']] = self.id2aut[qid]['aut_name']['it']
except:
continue
return self
    def _extend_kb(self, df):
        # Fold author/work pairs from the annotated-commentaries dataframe into the KB.
        _qid = 0
        prev_work = ''
        for i in range(len(df)):
            row = df.iloc[i]
            auth = row.quot_author
            work = row.quot_title
            if auth is not np.nan and work is not np.nan:
                if work != prev_work:
                    try:
                        # author already in aut2id: attach the work to its entry
                        qid = self.aut2id[auth]
                        new_wid = f'W{_qid}'
                        _qid += 1
                        self.id2aut[qid]['aut_works'][new_wid] = {'it': work}
                        prev_work = work
                    except KeyError:
                        # unknown author: create a provisional entry
                        # (NB: the provisional 'Q' ids could clash with real Wikidata QIDs already in the KB)
                        new_qid = f'Q{_qid}'
                        new_wid = f'W{_qid}'
                        _qid += 1
                        self.id2aut[new_qid] = {'aut_name': {'it': auth},
                                                'aut_works': {new_wid: {'it': work}},
                                                'aut_present_work': {},
                                                'birth': 0}
                        prev_work = work
            else:
                continue
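For reference, a provisional entry that _extend_kb creates for an author not yet present in the KB has the following shape; the id, author and work are illustrative, while entries taken from Wikidata carry real QIDs and birth years:

{'Q0': {'aut_name': {'it': 'Brunetto Latini'},
        'aut_works': {'W0': {'it': 'Tesoretto'}},
        'aut_present_work': {},
        'birth': 0}}  # 0 is the placeholder used when only the annotation is available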

main.py (86 lines changed)
View File

@ -5,9 +5,8 @@ from spacy.util import minibatch, compounding
import warnings
from preprocessing.ner_dataset_builder import DataSetBuilder
from entity_linker.knowledge_base import KnowledgeBase
from tqdm import tqdm
from tqdm import tqdm, trange
from pathlib import Path
import pickle
import random
@ -18,7 +17,16 @@ df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')
def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
def pprint_com(comment, l=100):
    # pretty-print a commentary in lines of at most l characters
    i = 0
    while len(comment) > i + l:
        print(comment[i:i + l])
        i += l
    print(comment[i:])
def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, nepochs=50, SPACY_MODEL_STD='it_core_news_sm'):
model = spacy.load(SPACY_MODEL_STD)
print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@ -36,35 +44,22 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
with model.disable_pipes(*other_pipes) and warnings.catch_warnings():
print(f'Enabled pipes at training: {[pipe for pipe in model.pipe_names]}')
# warnings.filterwarnings("once", category=UserWarning, module='spacy')
optimizer = model.resume_training()
n_epochs = 7
#batch_size = 32
n_epochs = nepochs
print(f'\n## Begin Training')
for i in tqdm(range(n_epochs), desc='Iter'):
#print(f'Iteration {i+1}')
t = trange(n_epochs, desc='Iter')
for i in t:
losses = {}
random.shuffle(TRAIN_DATA)
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
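# compounding(4.0, 32.0, 1.001) yields a batch size that starts at 4 and grows by a factor of 1.001 per batch, capped at 32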
for batch in batches:
docs, golds = zip(*batch)
model.update(docs, golds, sgd=optimizer, losses=losses)
t.set_description(f'NER loss: {round(losses["ner"], 5)}')
print(f'Final loss: {losses}')
seed = random.randint(1, len(clean_commentaries))
def pprint_com(comment, l=100):
i = 0
while len(comment) > i+100:
j = i+l
print(comment[i:j])
i += 100
print(comment[i:len(comment)])
disabled.restore()
print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
eg_eval = df_eval.iloc[seed]
@ -80,21 +75,11 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
for ent in doc.ents:
print(ent.text, ent.label_)
"""
print('\n')
print('-'*50)
print('STANDARD NER MODEL PREDICTIONS:')
nlp_reloaded = spacy.load('it_core_news_sm')
doc_STD = nlp_reloaded(clean_comment)
for ent in doc_STD.ents:
print(ent.text, ent.label_)
"""
print('\n')
print('-'*50)
print('GOLD:')
query = eg_eval['comment']
gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author']]
print(gold)
print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@ -104,15 +89,6 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
def predict_candidates(model, comment, labels=None):
def pprint_com(comment, l=100):
i = 0
while len(comment) > i+100:
j = i+l
print(comment[i:j])
i += 100
print(comment[i:len(comment)])
clean_comment = comment.replace('<i>', '')
clean_comment = clean_comment.replace('</i>', '')
clean_comment = clean_comment.replace('\\', '')
@ -123,7 +99,6 @@ def predict_candidates(model, comment, labels=None):
print('\n')
candidates = [(ent.text, ent.label_) for ent in doc.ents]
#print(candidates)
if labels is not None:
query = comment
@ -131,7 +106,7 @@ def predict_candidates(model, comment, labels=None):
print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
for i in range(len(gold)):
elem = gold.iloc[i]
print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}')
print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')
# print('\n')
return candidates, gold
@ -168,6 +143,13 @@ def load_word_vectors(model, path_to_vec, max_vec=100000):
return model
def connect_aut_work(list_aut, list_work, kb):
print('\n\nTODO')
# qid_list = [kb.aut2id[author] for author in list_aut]
# wid_list = [kb.works2aut[work] for work in list_work]
# print('lel')
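connect_aut_work is still a stub; a minimal sketch of one possible completion, assuming the intent hinted at by the commented lines above is simply to pair each matched work in work_res with the KB id of its author (this is not the author's final implementation):

def connect_aut_work(list_aut, list_work, kb):
    # list_work is work_res: {work mention: canonical author name}
    # list_aut is aut_res: author mentions that matched the KB (unused here,
    # but could be used to flag authors with no attached work)
    links = {}
    for work, author in list_work.items():
        links[work] = (author, kb.aut2id.get(author))  # (author name, KB id or None)
    return links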
def main():
df_TRAIN = df_monarchia
df_eval = df_convivio
@ -178,21 +160,21 @@ def main():
raw_commentaries_convivio = dataset.commentaries_eva
# train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest')
# nlp = spacy.load('it_core_news_sm')
# nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
# dataset_convivio = DataSetBuilder(df_eval, df_eval)
# dataset_convivio.export_dataset_doccano('std_convivio')
nlp = spacy.load('./model_fastText/model_spacy_latest')
# nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
# print(len(list(nlp.vocab.strings))) # get whole model vocabulary
seed = random.randint(1, len(commentaries_convivio_eva))
preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json')
kb.link_entities(preds, deepfuzz=True)
print(f'\nComment Numbert: {seed}')
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval)
aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
# Testing ------------------------
# connect_aut_work(aut_res, work_res, kb)
# --------------------------------
print(f'\nComment Number: {seed}')
# TODO: add a matcher that returns s_char and end_char of the matched entities!
exit()

View File

@ -92,7 +92,6 @@ class DataSetBuilder:
self.ner_clean_lookup = ner_clean_lookup
return ner_lookup, ner_clean_lookup
def _annotate_commentaries(self):
"""
@ -124,7 +123,6 @@ class DataSetBuilder:
matches_in_clean_commentaries.append(res)
return matches_in_commentaries, matches_in_clean_commentaries
def build_train_data(self):
from collections import OrderedDict
@ -142,7 +140,6 @@ class DataSetBuilder:
self.TRAIN_DATA = TRAIN_DATA
return TRAIN_DATA
def get_rehearsal_data(self):
revision_data = []
print('# NB: TAGGING WITH standard spacy model!')
@ -155,13 +152,12 @@ class DataSetBuilder:
self.revision_data = revision_data
return revision_data
def train_model(self):
from toolz import partition_all # See Docs @ https://toolz.readthedocs.io/en/latest/
import random
nlp_std = spacy.load(self.SPACY_MODEL_STD)
#revision_data = self.get_rehearsal_data()
#REHEARSAL_DATA = self.TRAIN_DATA[:300] + revision_data[:150]
@ -178,10 +174,10 @@ class DataSetBuilder:
other_pipes = [pipe for pipe in nlp_std.pipe_names if pipe not in pipe_exceptions]
optimizer = nlp_std.resume_training()
with nlp_std.disable_pipes(*other_pipes) and warnings.catch_warnings():
warnings.filterwarnings("once", category=UserWarning, module='spacy')
n_epochs = 10
#batch_size = 32
print(f'\n## Begin Training')
@ -191,9 +187,7 @@ class DataSetBuilder:
random.shuffle(REHEARSAL_DATA)
batches = minibatch(REHEARSAL_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
#for batch in partition_all(batch_size, REHEARSAL_DATA):
docs, golds = zip(*batch)
#texts, annotations = zip(*batch)
nlp_std.update(docs, golds, sgd=optimizer, losses=losses)
print(f'loss: {losses}')
@ -221,7 +215,6 @@ class DataSetBuilder:
for ent in doc_STD.ents:
print(ent.text, ent.label_)
def export_dataset_doccano(self, outputfile_name):
"""
Doccano JSONL data format:
@ -239,7 +232,6 @@ class DataSetBuilder:
with jsonlines.open(f'./commentaries/data_parsed/doccano_data/{outputfile_name}.jsonl', mode='w') as writer:
writer.write_all(output)
def merge_rehearsed(self, data, revision_data):
res = []
revision_data = revision_data
@ -258,7 +250,6 @@ class DataSetBuilder:
return res
def import_dataset_doccano(self, path):
data = []
with open(path) as infile:
@ -266,20 +257,20 @@ class DataSetBuilder:
for line in content:
json_data = json.loads(line)
ent = {'entities':[]}
ent = {'entities': []}
ent['entities'] = json_data['labels']
data.append((json_data['text'], ent))
self.TRAIN_DATA = data
return data
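For reference, import_dataset_doccano expects one JSON object per line with a 'text' field and character-offset 'labels'; an input line and the spaCy-style tuple built from it would look roughly as follows (the sentence is invented, the offsets refer to it):

# Doccano JSONL line:
#   {"text": "come dice Aristotile nel primo de l'Etica", "labels": [[10, 20, "PER"], [36, 41, "WORK_OF_ART"]]}
# resulting TRAIN_DATA item:
#   ("come dice Aristotile nel primo de l'Etica", {'entities': [[10, 20, "PER"], [36, 41, "WORK_OF_ART"]]})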
if __name__ == '__main__':
data = DataSetBuilder(df_commentary, df_ner_unique)
ner_lookup, ner_clean_lookup = data.get_NER_lookup()
data.get_commentaries()
#data.build_train_data()
#data.train_model()
#data.export_dataset_doccano()
#data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1')
#data.train_model()
# if __name__ == '__main__':
# data = DataSetBuilder(df_commentary, df_ner_unique)
#
# ner_lookup, ner_clean_lookup = data.get_NER_lookup()
# data.get_commentaries()
# data.build_train_data()
# data.train_model()
# data.export_dataset_doccano()
# data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1')
# data.train_model()