From 2324ddff9f629131c13e8910f24a1a9f6250d747 Mon Sep 17 00:00:00 2001 From: andrea Date: Fri, 23 Oct 2020 16:59:01 +0200 Subject: [PATCH] function to extend KB with data from annotated commentaries --- entity_linker/knowledge_base.py | 97 +++++++++++++----- .../{ => knowledge_base}/KB_abs_merged.pickle | Bin .../KB_abs_reversed.pickle | Bin main.py | 86 ++++++---------- preprocessing/ner_dataset_builder.py | 37 +++---- 5 files changed, 117 insertions(+), 103 deletions(-) rename entity_linker/{ => knowledge_base}/KB_abs_merged.pickle (100%) rename entity_linker/{ => knowledge_base}/KB_abs_reversed.pickle (100%) diff --git a/entity_linker/knowledge_base.py b/entity_linker/knowledge_base.py index cd73afa..6bdbc08 100644 --- a/entity_linker/knowledge_base.py +++ b/entity_linker/knowledge_base.py @@ -1,30 +1,33 @@ # TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz from difflib import SequenceMatcher import json +import numpy as np class KnowledgeBase: - - def __init__(self, kb_path): + def __init__(self, kb_path, extension=None): with open(kb_path, 'rb') as infile: data = json.load(infile) self.id2aut = data self.aut2id = {} + self.works2aut = {} self._popolate_aut2id() + if extension is not None: + self._extend_kb(extension) + self._popolate_aut2id() def link_entities(self, preds, deepfuzz=False): PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante'] WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART'] - print('-'*50) - print(f'Candidate authors (i.e., entitites matched): {PER_preds}') - # print(f'Candidates work:\n{WORK_preds}') + # print('-'*50) + # print(f'Candidate authors (i.e., entities matched): {PER_preds}') + # print(f'Candidates work :{WORK_preds}') COMMEDIA_DATE = 1321 - print('-'*50 + '\nChecking in KB...') - - # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'Aquino) + print('-'*50 + '\n\nOUTPUT:\n### Author matches:') + aut_res = [] for target in set(PER_preds): scores = [] deepscore = [] @@ -36,8 +39,10 @@ class KnowledgeBase: success = False for i in range(3): if scores[i][1] > .8: - print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"], scores[i][1]} - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}') + print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}') + #, scores[i][1]}') # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}') success = True + aut_res.append(target) break if deepfuzz and not success: for aut in self.aut2id.keys(): @@ -50,38 +55,74 @@ class KnowledgeBase: deepscore.append((aut, sim)) deepscore.sort(key=lambda tup: tup[1], reverse=True) for j in range(3): - if deepscore[j][1] > .8: + if deepscore[j][1] > .9: print( - f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"], deepscore[j][1]} - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}') + f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}') + #, deepscore[j][1]}') # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}') + aut_res.append(target) break - - return 0 - def _generate_utter_2_ent(self): - utt_2_ent = {} - for ent_en in self.kb.keys(): - for utt in self.kb[ent_en]['names']: - utt_2_ent[utt] = ent_en - return utt_2_ent + work_res = {} + if len(WORK_preds) != 0: + print('-' * 50 + '\n### Works matches:') + for target in set(WORK_preds): + scores_work = [] + for work in self.works2aut.keys(): 
+ sim = self._similar(target, work) + scores_work.append((work, sim)) + scores_work.sort(key=lambda tup: tup[1], reverse=True) + for i in range(3): + if scores_work[i][1] > .75: + print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}') + work_res[target] = self.works2aut[scores_work[i][0]] + break - def _check_other_lang(self, target, original_name): - other_names = self.kb[target[0]]['names'] - - scores = [] - for name in other_names: - sim = self._similar(original_name, name) - scores.append((name, sim)) - scores.sort(key=lambda tup: tup[1], reverse=True) - return scores + return aut_res, work_res def _similar(self,a, b): return SequenceMatcher(None, a, b).ratio() def _popolate_aut2id(self): for qid, values in self.id2aut.items(): + if qid == 'null': + continue if values is not None: l_names = set(values['aut_name'].values()) for name in l_names: self.aut2id[name] = qid + works = values['aut_works'] + if len(works) != 0: + for wid, wvalues in works.items(): + try: + self.works2aut[wvalues['it']] = self.id2aut[qid]['aut_name']['it'] + except: + continue + return self + def _extend_kb(self, df): + _qid = 0 + prev_work = '' + for i in range(len(df)): + row = df.iloc[i] + auth = row.quot_author + work = row.quot_title + if auth is not np.nan and work is not np.nan: + if work != prev_work: + try: + qid = self.aut2id[auth] + new_wid = f'W{_qid}' + _qid += 1 + self.id2aut[qid]['aut_works'][new_wid] = {'it': work} + prev_work = work + except: + new_qid = f'Q{str(_qid)}' + new_wid = f'W{str(_qid)}' + _qid += 1 + self.id2aut[new_qid] = {'aut_name': {'it': auth}, + 'aut_works': {new_wid: {'it': work}}, + 'aut_present_work': {}, + 'birth': 0} + prev_work = work + else: + continue \ No newline at end of file diff --git a/entity_linker/KB_abs_merged.pickle b/entity_linker/knowledge_base/KB_abs_merged.pickle similarity index 100% rename from entity_linker/KB_abs_merged.pickle rename to entity_linker/knowledge_base/KB_abs_merged.pickle diff --git a/entity_linker/KB_abs_reversed.pickle b/entity_linker/knowledge_base/KB_abs_reversed.pickle similarity index 100% rename from entity_linker/KB_abs_reversed.pickle rename to entity_linker/knowledge_base/KB_abs_reversed.pickle diff --git a/main.py b/main.py index 0a16215..76c920b 100644 --- a/main.py +++ b/main.py @@ -5,9 +5,8 @@ from spacy.util import minibatch, compounding import warnings from preprocessing.ner_dataset_builder import DataSetBuilder from entity_linker.knowledge_base import KnowledgeBase -from tqdm import tqdm +from tqdm import tqdm, trange from pathlib import Path -import pickle import random @@ -18,7 +17,16 @@ df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv') df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv') -def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'): +def pprint_com(comment, l=100): + i = 0 + while len(comment) > i + 100: + j = i + l + print(comment[i:j]) + i += 100 + print(comment[i:len(comment)]) + + +def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, nepochs=50, SPACY_MODEL_STD='it_core_news_sm'): model = spacy.load(SPACY_MODEL_STD) print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}') @@ -36,35 +44,22 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL with model.disable_pipes(*other_pipes) and warnings.catch_warnings(): print(f'Enabled pipes at training: {[pipe for pipe in model.pipe_names]}') - # warnings.filterwarnings("once", category=UserWarning, 
module='spacy') - optimizer = model.resume_training() - - n_epochs = 7 - #batch_size = 32 + n_epochs = nepochs print(f'\n## Begin Training') - for i in tqdm(range(n_epochs), desc='Iter'): - #print(f'Iteration {i+1}') + t = trange(n_epochs, desc='Iter') + for i in t: losses = {} random.shuffle(TRAIN_DATA) batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: docs, golds = zip(*batch) model.update(docs, golds, sgd=optimizer, losses=losses) + t.set_description(f'NER loss: {round(losses["ner"], 5)}') print(f'Final loss: {losses}') seed = random.randint(1, len(clean_commentaries)) - - def pprint_com(comment, l=100): - i = 0 - while len(comment) > i+100: - j = i+l - print(comment[i:j]) - i += 100 - print(comment[i:len(comment)]) - disabled.restore() - print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}') eg_eval = df_eval.iloc[seed] @@ -80,21 +75,11 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL for ent in doc.ents: print(ent.text, ent.label_) - """ - print('\n') - print('-'*50) - print('STANDARD NER MODEL PREDICTIONS:') - nlp_reloaded = spacy.load('it_core_news_sm') - doc_STD = nlp_reloaded(clean_comment) - for ent in doc_STD.ents: - print(ent.text, ent.label_) - """ - print('\n') print('-'*50) print('GOLD:') query = eg_eval['comment'] - gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']] + gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author']] print(gold) print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}') @@ -104,15 +89,6 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL def predict_candidates(model, comment, labels=None): - - def pprint_com(comment, l=100): - i = 0 - while len(comment) > i+100: - j = i+l - print(comment[i:j]) - i += 100 - print(comment[i:len(comment)]) - clean_comment = comment.replace('', '') clean_comment = clean_comment.replace('', '') clean_comment = clean_comment.replace('\\', '') @@ -123,7 +99,6 @@ def predict_candidates(model, comment, labels=None): print('\n') candidates = [(ent.text, ent.label_) for ent in doc.ents] - #print(candidates) if labels is not None: query = comment @@ -131,7 +106,7 @@ def predict_candidates(model, comment, labels=None): print(f'{len(gold)} GOLD TARGETS ' + '-'*50) for i in range(len(gold)): elem = gold.iloc[i] - print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}') + print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}') # print('\n') return candidates, gold @@ -168,6 +143,13 @@ def load_word_vectors(model, path_to_vec, max_vec=100000): return model +def connect_aut_work(list_aut, list_work, kb): + print('\n\nTODO') + # qid_list = [kb.aut2id[author] for author in list_aut] + # wid_list = [kb.works2aut[work] for work in list_work] + # print('lel') + + def main(): df_TRAIN = df_monarchia df_eval = df_convivio @@ -178,21 +160,21 @@ def main(): raw_commentaries_convivio = dataset.commentaries_eva # train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest') - # nlp = spacy.load('it_core_news_sm') - # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000) - # dataset_convivio = DataSetBuilder(df_eval, df_eval) - # dataset_convivio.export_dataset_doccano('std_convivio') - nlp = spacy.load('./model_fastText/model_spacy_latest') - # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000) - # 
print(len(list(nlp.vocab.strings))) # get whole model vocabulary seed = random.randint(1, len(commentaries_convivio_eva)) preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval) - kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json') - kb.link_entities(preds, deepfuzz=True) - print(f'\nComment Numbert: {seed}') + kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval) + aut_res, work_res = kb.link_entities(preds, deepfuzz=True) + + # Testing ------------------------ + # connect_aut_work(aut_res, work_res, kb) + # -------------------------------- + + print(f'\nComment Number: {seed}') + + # TODO: add a matcher that returns s_char and end_char of the matched entities! exit() diff --git a/preprocessing/ner_dataset_builder.py b/preprocessing/ner_dataset_builder.py index 7b9309a..189edfe 100644 --- a/preprocessing/ner_dataset_builder.py +++ b/preprocessing/ner_dataset_builder.py @@ -92,7 +92,6 @@ class DataSetBuilder: self.ner_clean_lookup = ner_clean_lookup return ner_lookup, ner_clean_lookup - def _annotate_commentaries(self): """ @@ -124,7 +123,6 @@ class DataSetBuilder: matches_in_clean_commentaries.append(res) return matches_in_commentaries, matches_in_clean_commentaries - def build_train_data(self): from collections import OrderedDict @@ -142,7 +140,6 @@ class DataSetBuilder: self.TRAIN_DATA = TRAIN_DATA return TRAIN_DATA - def get_rehearsal_data(self): revision_data = [] print('# NB: TAGGING WITH standard spacy model!') @@ -155,13 +152,12 @@ class DataSetBuilder: self.revision_data = revision_data return revision_data - def train_model(self): from toolz import partition_all # See Docs @ https://toolz.readthedocs.io/en/latest/ import random nlp_std = spacy.load(self.SPACY_MODEL_STD) - + #revision_data = self.get_rehearsal_data() #REHEARSAL_DATA = self.TRAIN_DATA[:300] + revision_data[:150] @@ -178,10 +174,10 @@ class DataSetBuilder: other_pipes = [pipe for pipe in nlp_std.pipe_names if pipe not in pipe_exceptions] optimizer = nlp_std.resume_training() - + with nlp_std.disable_pipes(*other_pipes) and warnings.catch_warnings(): warnings.filterwarnings("once", category=UserWarning, module='spacy') - + n_epochs = 10 #batch_size = 32 print(f'\n## Begin Training') @@ -191,9 +187,7 @@ class DataSetBuilder: random.shuffle(REHEARSAL_DATA) batches = minibatch(REHEARSAL_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - #for batch in partition_all(batch_size, REHEARSAL_DATA): docs, golds = zip(*batch) - #texts, annotations = zip(*batch) nlp_std.update(docs, golds, sgd=optimizer, losses=losses) print(f'loss: {losses}') @@ -221,7 +215,6 @@ class DataSetBuilder: for ent in doc_STD.ents: print(ent.text, ent.label_) - def export_dataset_doccano(self, outputfile_name): """ Doccano JSONL data format: @@ -239,7 +232,6 @@ class DataSetBuilder: with jsonlines.open(f'./commentaries/data_parsed/doccano_data/{outputfile_name}.jsonl', mode='w') as writer: writer.write_all(output) - def merge_rehearsed(self, data, revision_data): res = [] revision_data = revision_data @@ -258,7 +250,6 @@ class DataSetBuilder: return res - def import_dataset_doccano(self, path): data = [] with open(path) as infile: @@ -266,20 +257,20 @@ class DataSetBuilder: for line in content: json_data = json.loads(line) - ent = {'entities':[]} + ent = {'entities': []} ent['entities'] = json_data['labels'] data.append((json_data['text'], ent)) self.TRAIN_DATA = data return data -if __name__ == '__main__': - data = 
DataSetBuilder(df_commentary, df_ner_unique) - - ner_lookup, ner_clean_lookup = data.get_NER_lookup() - data.get_commentaries() - #data.build_train_data() - #data.train_model() - #data.export_dataset_doccano() - #data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1') - #data.train_model() +# if __name__ == '__main__': +# data = DataSetBuilder(df_commentary, df_ner_unique) +# +# ner_lookup, ner_clean_lookup = data.get_NER_lookup() +# data.get_commentaries() +# data.build_train_data() +# data.train_model() +# data.export_dataset_doccano() +# data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1') +# data.train_model()
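

Usage sketch (reviewer illustration, not part of the commit): the patch adds an
`extension` argument to KnowledgeBase and makes link_entities() return a pair
(aut_res, work_res). A minimal way to exercise the new API, assuming a CSV of
annotated commentaries with quot_author and quot_title columns as read in
main.py — the CSV path and the example `preds` list below are placeholders:

    import pandas as pd
    from entity_linker.knowledge_base import KnowledgeBase

    # Annotated commentaries used to extend the KB (path is illustrative)
    df_eval = pd.read_csv('convivio_DF.csv')

    # Build the KB from the Wikimedia JSON and extend it with the annotations
    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json',
                       extension=df_eval)

    # NER output as (text, label) tuples, e.g. from predict_candidates() in main.py
    preds = [('Aristotile', 'PER'), ('Etica', 'WORK_OF_ART')]

    aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
    print(aut_res)   # author mentions matched in the KB
    print(work_res)  # {work mention: KB author name}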