function to extend KB with data from annotated commentaries

andrea 2020-10-23 16:59:01 +02:00
parent fb84b36b90
commit 2324ddff9f
5 changed files with 117 additions and 103 deletions

entity_linker/knowledge_base.py

@@ -1,30 +1,33 @@
 # TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
 from difflib import SequenceMatcher
 import json
+import numpy as np
 
 
 class KnowledgeBase:
-    def __init__(self, kb_path):
+    def __init__(self, kb_path, extension=None):
         with open(kb_path, 'rb') as infile:
             data = json.load(infile)
         self.id2aut = data
         self.aut2id = {}
+        self.works2aut = {}
         self._popolate_aut2id()
+        if extension is not None:
+            self._extend_kb(extension)
+            self._popolate_aut2id()
 
     def link_entities(self, preds, deepfuzz=False):
         PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
         WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
-        print('-'*50)
-        print(f'Candidate authors (i.e., entitites matched): {PER_preds}')
-        # print(f'Candidates work:\n{WORK_preds}')
+        # print('-'*50)
+        # print(f'Candidate authors (i.e., entities matched): {PER_preds}')
+        # print(f'Candidates work :{WORK_preds}')
         COMMEDIA_DATE = 1321
-        print('-'*50 + '\nChecking in KB...')
+        print('-'*50 + '\n\nOUTPUT:\n### Author matches:')
+        # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'Aquino)
+        aut_res = []
         for target in set(PER_preds):
             scores = []
             deepscore = []
@@ -36,8 +39,10 @@ class KnowledgeBase:
             success = False
             for i in range(3):
                 if scores[i][1] > .8:
-                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"], scores[i][1]} - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
+                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}')
+                    #, scores[i][1]}') # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
                     success = True
+                    aut_res.append(target)
                     break
             if deepfuzz and not success:
                 for aut in self.aut2id.keys():
@@ -50,38 +55,74 @@ class KnowledgeBase:
                     deepscore.append((aut, sim))
                 deepscore.sort(key=lambda tup: tup[1], reverse=True)
                 for j in range(3):
-                    if deepscore[j][1] > .8:
+                    if deepscore[j][1] > .9:
                         print(
-                            f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"], deepscore[j][1]} - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
+                            f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}')
+                        #, deepscore[j][1]}') # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
+                        aut_res.append(target)
                         break
-        return 0
 
-    def _generate_utter_2_ent(self):
-        utt_2_ent = {}
-        for ent_en in self.kb.keys():
-            for utt in self.kb[ent_en]['names']:
-                utt_2_ent[utt] = ent_en
-        return utt_2_ent
+        work_res = {}
+        if len(WORK_preds) != 0:
+            print('-' * 50 + '\n### Works matches:')
+            for target in set(WORK_preds):
+                scores_work = []
+                for work in self.works2aut.keys():
+                    sim = self._similar(target, work)
+                    scores_work.append((work, sim))
+                scores_work.sort(key=lambda tup: tup[1], reverse=True)
+                for i in range(3):
+                    if scores_work[i][1] > .75:
+                        print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}')
+                        work_res[target] = self.works2aut[scores_work[i][0]]
+                        break
+        return aut_res, work_res
 
-    def _check_other_lang(self, target, original_name):
-        other_names = self.kb[target[0]]['names']
-        scores = []
-        for name in other_names:
-            sim = self._similar(original_name, name)
-            scores.append((name, sim))
-        scores.sort(key=lambda tup: tup[1], reverse=True)
-        return scores
-
     def _similar(self,a, b):
         return SequenceMatcher(None, a, b).ratio()
 
     def _popolate_aut2id(self):
         for qid, values in self.id2aut.items():
+            if qid == 'null':
+                continue
             if values is not None:
                 l_names = set(values['aut_name'].values())
                 for name in l_names:
                     self.aut2id[name] = qid
+                works = values['aut_works']
+                if len(works) != 0:
+                    for wid, wvalues in works.items():
+                        try:
+                            self.works2aut[wvalues['it']] = self.id2aut[qid]['aut_name']['it']
+                        except:
+                            continue
         return self
+
+    def _extend_kb(self, df):
+        _qid = 0
+        prev_work = ''
+        for i in range(len(df)):
+            row = df.iloc[i]
+            auth = row.quot_author
+            work = row.quot_title
+            if auth is not np.nan and work is not np.nan:
+                if work != prev_work:
+                    try:
+                        qid = self.aut2id[auth]
+                        new_wid = f'W{_qid}'
+                        _qid += 1
+                        self.id2aut[qid]['aut_works'][new_wid] = {'it': work}
+                        prev_work = work
+                    except:
+                        new_qid = f'Q{str(_qid)}'
+                        new_wid = f'W{str(_qid)}'
                        _qid += 1
+                        self.id2aut[new_qid] = {'aut_name': {'it': auth},
+                                                'aut_works': {new_wid: {'it': work}},
+                                                'aut_present_work': {},
+                                                'birth': 0}
+                        prev_work = work
+            else:
+                continue
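
For orientation, this is roughly how the extended class is driven from main.py below. A minimal sketch, assuming the KB JSON exists at the path used in the repo; the sample DataFrame rows and predictions are invented, and only the quot_author/quot_title columns and the constructor/return signatures come from the diff.

import pandas as pd
from entity_linker.knowledge_base import KnowledgeBase

# Hypothetical annotated-commentary table; _extend_kb() only reads
# the quot_author and quot_title columns.
df_eval = pd.DataFrame({
    'quot_author': ['Aristotele', 'Virgilio'],
    'quot_title': ['Etica Nicomachea', 'Eneide'],
})

# extension=df_eval adds unseen author/work pairs to the KB and then
# rebuilds the lookup tables (aut2id, works2aut).
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json',
                   extension=df_eval)

# link_entities() now returns the linked authors and a work -> author map.
preds = [('Aristotile', 'PER'), ('Eneida', 'WORK_OF_ART')]
aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
print(aut_res, work_res)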

main.py

@@ -5,9 +5,8 @@ from spacy.util import minibatch, compounding
 import warnings
 from preprocessing.ner_dataset_builder import DataSetBuilder
 from entity_linker.knowledge_base import KnowledgeBase
-from tqdm import tqdm
+from tqdm import tqdm, trange
 from pathlib import Path
-import pickle
 import random
@@ -18,7 +17,16 @@ df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
 df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')
 
 
-def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
+def pprint_com(comment, l=100):
+    i = 0
+    while len(comment) > i + 100:
+        j = i + l
+        print(comment[i:j])
+        i += 100
+    print(comment[i:len(comment)])
+
+
+def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, nepochs=50, SPACY_MODEL_STD='it_core_news_sm'):
     model = spacy.load(SPACY_MODEL_STD)
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@@ -36,35 +44,22 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
     with model.disable_pipes(*other_pipes) and warnings.catch_warnings():
         print(f'Enabled pipes at training: {[pipe for pipe in model.pipe_names]}')
-        # warnings.filterwarnings("once", category=UserWarning, module='spacy')
         optimizer = model.resume_training()
-        n_epochs = 7
-        #batch_size = 32
+        n_epochs = nepochs
         print(f'\n## Begin Training')
-        for i in tqdm(range(n_epochs), desc='Iter'):
-            #print(f'Iteration {i+1}')
+        t = trange(n_epochs, desc='Iter')
+        for i in t:
             losses = {}
             random.shuffle(TRAIN_DATA)
             batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
             for batch in batches:
                 docs, golds = zip(*batch)
                 model.update(docs, golds, sgd=optimizer, losses=losses)
+            t.set_description(f'NER loss: {round(losses["ner"], 5)}')
         print(f'Final loss: {losses}')
 
     seed = random.randint(1, len(clean_commentaries))
-
-    def pprint_com(comment, l=100):
-        i = 0
-        while len(comment) > i+100:
-            j = i+l
-            print(comment[i:j])
-            i += 100
-        print(comment[i:len(comment)])
-
     disabled.restore()
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
     eg_eval = df_eval.iloc[seed]
@@ -80,21 +75,11 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
     for ent in doc.ents:
         print(ent.text, ent.label_)
-    """
-    print('\n')
-    print('-'*50)
-    print('STANDARD NER MODEL PREDICTIONS:')
-    nlp_reloaded = spacy.load('it_core_news_sm')
-    doc_STD = nlp_reloaded(clean_comment)
-    for ent in doc_STD.ents:
-        print(ent.text, ent.label_)
-    """
     print('\n')
     print('-'*50)
     print('GOLD:')
     query = eg_eval['comment']
-    gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
+    gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author']]
     print(gold)
 
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@@ -104,15 +89,6 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
 def predict_candidates(model, comment, labels=None):
-    def pprint_com(comment, l=100):
-        i = 0
-        while len(comment) > i+100:
-            j = i+l
-            print(comment[i:j])
-            i += 100
-        print(comment[i:len(comment)])
-
     clean_comment = comment.replace('<i>', '')
     clean_comment = clean_comment.replace('</i>', '')
     clean_comment = clean_comment.replace('\\', '')
@@ -123,7 +99,6 @@ def predict_candidates(model, comment, labels=None):
     print('\n')
 
     candidates = [(ent.text, ent.label_) for ent in doc.ents]
-    #print(candidates)
 
     if labels is not None:
         query = comment
@@ -131,7 +106,7 @@ def predict_candidates(model, comment, labels=None):
         print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
         for i in range(len(gold)):
             elem = gold.iloc[i]
-            print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}')
+            print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')
             # print('\n')
     return candidates, gold
@@ -168,6 +143,13 @@ def load_word_vectors(model, path_to_vec, max_vec=100000):
     return model
 
 
+def connect_aut_work(list_aut, list_work, kb):
+    print('\n\nTODO')
+    # qid_list = [kb.aut2id[author] for author in list_aut]
+    # wid_list = [kb.works2aut[work] for work in list_work]
+    # print('lel')
+
+
 def main():
     df_TRAIN = df_monarchia
     df_eval = df_convivio
@@ -178,21 +160,21 @@ def main():
     raw_commentaries_convivio = dataset.commentaries_eva
 
     # train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest')
-    # nlp = spacy.load('it_core_news_sm')
-    # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
-    # dataset_convivio = DataSetBuilder(df_eval, df_eval)
-    # dataset_convivio.export_dataset_doccano('std_convivio')
 
     nlp = spacy.load('./model_fastText/model_spacy_latest')
+    # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
+    # print(len(list(nlp.vocab.strings))) # get whole model vocabulary
 
     seed = random.randint(1, len(commentaries_convivio_eva))
     preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
-    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json')
-    kb.link_entities(preds, deepfuzz=True)
-    print(f'\nComment Numbert: {seed}')
+    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval)
+    aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
+    # Testing ------------------------
+    # connect_aut_work(aut_res, work_res, kb)
+    # --------------------------------
+    print(f'\nComment Number: {seed}')
+    # TODO: add a matcher that returns s_char and end_char of the matched entities!
     exit()
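
The switch from tqdm(range(...)) to trange in train_model is what lets the progress bar carry the running NER loss via set_description. A standalone sketch of that pattern with dummy loss values, not the project's actual training loop:

from tqdm import trange
import random

t = trange(5, desc='Iter')
for i in t:
    losses = {'ner': random.random() * 10}   # stand-in for spaCy's losses dict
    t.set_description(f'NER loss: {round(losses["ner"], 5)}')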

preprocessing/ner_dataset_builder.py

@@ -92,7 +92,6 @@ class DataSetBuilder:
         self.ner_clean_lookup = ner_clean_lookup
         return ner_lookup, ner_clean_lookup
 
-
     def _annotate_commentaries(self):
         """
@@ -124,7 +123,6 @@ class DataSetBuilder:
             matches_in_clean_commentaries.append(res)
         return matches_in_commentaries, matches_in_clean_commentaries
 
-
     def build_train_data(self):
         from collections import OrderedDict
@@ -142,7 +140,6 @@ class DataSetBuilder:
         self.TRAIN_DATA = TRAIN_DATA
         return TRAIN_DATA
 
-
     def get_rehearsal_data(self):
         revision_data = []
         print('# NB: TAGGING WITH standard spacy model!')
@@ -155,13 +152,12 @@ class DataSetBuilder:
         self.revision_data = revision_data
         return revision_data
 
-
     def train_model(self):
         from toolz import partition_all # See Docs @ https://toolz.readthedocs.io/en/latest/
         import random
 
         nlp_std = spacy.load(self.SPACY_MODEL_STD)
         #revision_data = self.get_rehearsal_data()
         #REHEARSAL_DATA = self.TRAIN_DATA[:300] + revision_data[:150]
@@ -178,10 +174,10 @@ class DataSetBuilder:
         other_pipes = [pipe for pipe in nlp_std.pipe_names if pipe not in pipe_exceptions]
         optimizer = nlp_std.resume_training()
 
         with nlp_std.disable_pipes(*other_pipes) and warnings.catch_warnings():
             warnings.filterwarnings("once", category=UserWarning, module='spacy')
 
             n_epochs = 10
             #batch_size = 32
             print(f'\n## Begin Training')
@@ -191,9 +187,7 @@ class DataSetBuilder:
                 random.shuffle(REHEARSAL_DATA)
                 batches = minibatch(REHEARSAL_DATA, size=compounding(4.0, 32.0, 1.001))
                 for batch in batches:
-                    #for batch in partition_all(batch_size, REHEARSAL_DATA):
                     docs, golds = zip(*batch)
-                    #texts, annotations = zip(*batch)
                     nlp_std.update(docs, golds, sgd=optimizer, losses=losses)
                 print(f'loss: {losses}')
@@ -221,7 +215,6 @@ class DataSetBuilder:
         for ent in doc_STD.ents:
             print(ent.text, ent.label_)
 
-
     def export_dataset_doccano(self, outputfile_name):
         """
         Doccano JSONL data format:
@@ -239,7 +232,6 @@ class DataSetBuilder:
         with jsonlines.open(f'./commentaries/data_parsed/doccano_data/{outputfile_name}.jsonl', mode='w') as writer:
             writer.write_all(output)
 
-
     def merge_rehearsed(self, data, revision_data):
         res = []
         revision_data = revision_data
@@ -258,7 +250,6 @@ class DataSetBuilder:
         return res
 
-
     def import_dataset_doccano(self, path):
         data = []
         with open(path) as infile:
@@ -266,20 +257,20 @@ class DataSetBuilder:
             for line in content:
                 json_data = json.loads(line)
-                ent = {'entities':[]}
+                ent = {'entities': []}
                 ent['entities'] = json_data['labels']
                 data.append((json_data['text'], ent))
         self.TRAIN_DATA = data
         return data
 
 
-if __name__ == '__main__':
-    data = DataSetBuilder(df_commentary, df_ner_unique)
-
-    ner_lookup, ner_clean_lookup = data.get_NER_lookup()
-    data.get_commentaries()
-    #data.build_train_data()
-    #data.train_model()
-    #data.export_dataset_doccano()
-    #data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1')
-    #data.train_model()
+# if __name__ == '__main__':
+# data = DataSetBuilder(df_commentary, df_ner_unique)
+#
+# ner_lookup, ner_clean_lookup = data.get_NER_lookup()
+# data.get_commentaries()
+# data.build_train_data()
+# data.train_model()
+# data.export_dataset_doccano()
+# data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1')
+# data.train_model()
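
As a reminder of what import_dataset_doccano() consumes, here is a sketch of one Doccano-style JSONL record and how it maps onto the (text, {'entities': [...]}) tuples stored in TRAIN_DATA; the text and span values are invented for illustration.

import json

# One JSONL line per example: the raw text plus [start, end, label] spans.
line = '{"text": "Come dice Aristotele ne la sua Etica", "labels": [[10, 20, "PER"]]}'

json_data = json.loads(line)
ent = {'entities': json_data['labels']}
example = (json_data['text'], ent)   # same shape as the entries in TRAIN_DATA
print(example)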