minor fixes and added some comments

andrea 2020-10-28 09:52:45 +01:00
parent 6d65bfb03d
commit c084babebd
11 changed files with 34 additions and 76 deletions

View File

@@ -1,4 +0,0 @@
This refers to Bonifacio II (1150-1207), who was one of the leaders of the Fourth Crusade, which
ended with the capture of Constantinople (1204) and the creation of the Latin Empire of the East
(Bonifacio received the Kingdom of Thessalonica). The Provençal troubadours of his time exalted the
generous hospitality of his court (Rambaut de Vaqueiras praises him by comparing him precisely to
Alexander the Great).

View File

@@ -7,7 +7,7 @@ import json
import time
def testing_dbpedia(author):
def query_dbpedia(author):
endpoint = 'http://dbpedia.org/sparql'
s = sparql.Service(endpoint, "utf-8", "SELECT")
query_author = """SELECT ?names WHERE {{
@@ -23,7 +23,7 @@ def testing_dbpedia(author):
return [result, results_works]
def testing_wikidata(entity_q):
def query_wikidata(entity_q):
"""
Notable work = P800
Date of birth = P569
@@ -95,11 +95,11 @@ full_auth_list = author_uri_list_convivio + author_uri_list_monarchia + author_u
dict_res = {}
print(f'# Number of authors: {len(full_auth_list)}')
for auth in tqdm.tqdm(full_auth_list):
entity_q = testing_dbpedia(auth)[0]
entity_q = query_dbpedia(auth)[0]
wikidata_endp = extract_wikidata_endpoint(entity_q, show_warnings=False)
dict_res[wikidata_endp] = None
if wikidata_endp is not None:
_, names, works, other_works, y_birth = testing_wikidata(wikidata_endp)
_, names, works, other_works, y_birth = query_wikidata(wikidata_endp)
dict_res[wikidata_endp] = {'aut_name': names,
'aut_works': works,
'aut_present_work': other_works,
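
For reference, a minimal sketch of the kind of lookup query_wikidata presumably performs, written
against the public Wikidata SPARQL endpoint with the requests library (the function name, the query
shape, and the example QID are illustrative assumptions, not the project's actual code):

import requests

def sketch_query_wikidata(entity_q):
    # P800 = notable work, P569 = date of birth (the properties named in the docstring above).
    query = f"""
    SELECT ?work ?birth WHERE {{
      OPTIONAL {{ wd:{entity_q} wdt:P800 ?work . }}
      OPTIONAL {{ wd:{entity_q} wdt:P569 ?birth . }}
    }}
    """
    resp = requests.get('https://query.wikidata.org/sparql',
                        params={'query': query, 'format': 'json'})
    resp.raise_for_status()
    return resp.json()['results']['bindings']

# sketch_query_wikidata('Q1067')  # Q1067 = Dante Alighieri on Wikidata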

View File

@@ -1,49 +0,0 @@
import pickle
from pprint import pprint

"""
with open('./KB_abs_convivio.pickle', 'rb') as infile:
    kb1 = pickle.load(infile)
print(len(kb1))

with open('./KB_abs_monarchia.pickle', 'rb') as infile:
    kb2 = pickle.load(infile)
print(len(kb2))

def merge_dicts(iter_dict):
    # First-wins merge: keys from earlier dicts take precedence over later ones.
    merged = {}
    for d in iter_dict:
        for k, v in d.items():
            if k not in merged:
                merged[k] = v
    return merged

merged = merge_dicts([kb1, kb2])

with open('./KB_abs_merged.pickle', 'wb') as outfile:
    pickle.dump(merged, outfile)

pprint(merged['Giles_of_Rome'])
"""

with open('knowledge_base/KB_abs_merged.pickle', 'rb') as infile:
    kb = pickle.load(infile)

# Invert the KB: map every alias of an author to that author's record.
reversed_dict = {}
for key in kb.keys():
    for name in kb[key]['names']:
        reversed_dict[name] = {'name': key,
                               'birth': kb[key]['birth'],
                               'abstract': kb[key]['abstract']}

print(len(reversed_dict))

with open('knowledge_base/KB_abs_reversed.pickle', 'wb') as outfile:
    pickle.dump(reversed_dict, outfile)
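
The inversion above gives O(1) lookup from any author alias to the author's record; a hypothetical
lookup (the alias string is illustrative):

with open('knowledge_base/KB_abs_reversed.pickle', 'rb') as infile:
    kb_rev = pickle.load(infile)

record = kb_rev.get("Tommaso d'Aquino")  # hypothetical alias
# record is {'name': ..., 'birth': ..., 'abstract': ...}, or None if the alias is unknown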

View File

@@ -18,15 +18,22 @@ class KnowledgeBase:
self._popolate_aut2id()
def link_entities(self, preds, deepfuzz=False):
COMMEDIA_DATE = 1321
PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
# print('-'*50)
# print(f'Candidate authors (i.e., entities matched): {PER_preds}')
# print(f'Candidates work :{WORK_preds}')
COMMEDIA_DATE = 1321
print('-'*50 + '\n\nOUTPUT:\n### Author matches:')
print('\nMODEL RAW PREDICTIONS:')
print(f'Candidate authors (i.e., entities matched): {PER_preds}')
print(f'Candidate works: {WORK_preds}')
print('-'*50 + '\n\nFINAL OUTPUT:\n### Author matches:')
"""
Sorting PER_preds (i.e., entities of type PERSON matched by the NER model in the given commentary) according
to the edit distance computed across all of the entities' name in the KB. At first, the similarity measure
is computed between the match and the exact name stored in the KB. Eventually, if deepscore == True, such a
metric is computed beteen the original NER match and every author name present in the KB but also split on
the ' ' (i.e., space) in order to deal with name abbreviations (i.e., 'Tommaso d'Aquino' is often referred to as
simply 'Tommaso' in the commentaries). Once sorted, the first element satisfying the given threshold is returned.
"""
aut_res = []
for target in set(PER_preds):
scores = []
@@ -40,7 +47,6 @@ class KnowledgeBase:
for i in range(3):
if scores[i][1] > .8:
print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}')
#, scores[i][1]}') # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
success = True
aut_res.append(target)
break
@@ -55,13 +61,18 @@ class KnowledgeBase:
deepscore.append((aut, sim))
deepscore.sort(key=lambda tup: tup[1], reverse=True)
for j in range(3):
if deepscore[j][1] > .9:
if deepscore[j][1] > .8:
print(
f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}')
#, deepscore[j][1]}') # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
aut_res.append(target)
break
"""
Sorting WORK_preds (i.e., entities of type WORK_OF_ART matched by the NER model in the given commentary)
according to the edit distance computed across all of the titles in the KB. The similarity measure
is computed between the match and the exact name stored in the KB.
Once sorted, the first element satisfying the given threshold is returned.
"""
work_res = {}
if len(WORK_preds) != 0:
print('-' * 50 + '\n### Work matches:')
@@ -72,7 +83,7 @@ class KnowledgeBase:
scores_work.append((work, sim))
scores_work.sort(key=lambda tup: tup[1], reverse=True)
for i in range(3):
if scores_work[i][1] > .75:
if scores_work[i][1] > .7:
print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}')
work_res[target] = self.works2aut[scores_work[i][0]]
break
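
The two docstrings above describe a two-pass fuzzy strategy: exact-name similarity first, then
token-level similarity when deepfuzz is enabled. A self-contained sketch of that strategy, using
difflib.SequenceMatcher as a stand-in for whatever similarity function the class actually calls
(the function name, threshold default, and the use of difflib are assumptions):

import difflib

def best_fuzzy_match(target, names, threshold=0.8, deepfuzz=False):
    # Pass 1: similarity against each full name in the KB.
    scores = [(name, difflib.SequenceMatcher(None, target, name).ratio()) for name in names]
    scores.sort(key=lambda tup: tup[1], reverse=True)
    if scores and scores[0][1] > threshold:
        return scores[0][0]
    if not deepfuzz:
        return None
    # Pass 2: similarity against each space-separated token of each name,
    # so that 'Tommaso' can still match 'Tommaso d'Aquino'.
    deepscore = []
    for name in names:
        for token in name.split(' '):
            deepscore.append((name, difflib.SequenceMatcher(None, target, token).ratio()))
    deepscore.sort(key=lambda tup: tup[1], reverse=True)
    if deepscore and deepscore[0][1] > threshold:
        return deepscore[0][0]
    return None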

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

main.py
View File

@@ -14,8 +14,11 @@ COMMENTARIES_PATH = './commentaries/'
DF_COMMENTARIES_PATH = './commentaries/data_parsed/'
df_monarchia = pd.read_csv(DF_COMMENTARIES_PATH + 'monarchia_DF.csv')
df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
df_rime = pd.read_csv(DF_COMMENTARIES_PATH + 'rime_DF.csv')
df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')
# TODO --> rime.xml
def pprint_com(comment, l=100):
i = 0
@@ -103,7 +106,7 @@ def predict_candidates(model, comment, labels=None):
if labels is not None:
query = comment
gold = labels[labels['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
print(f'{len(gold)} GOLD TARGETS:')
for i in range(len(gold)):
elem = gold.iloc[i]
print(f'Title: {elem["quot_title"]}\nAuthor: {elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')
@@ -147,13 +150,13 @@ def connect_aut_work(list_aut, list_work, kb):
print('\n\nTODO')
# qid_list = [kb.aut2id[author] for author in list_aut]
# wid_list = [kb.works2aut[work] for work in list_work]
# print('lel')
def main():
df_TRAIN = df_monarchia
df_eval = df_convivio
df_eval = df_rime
dataset = DataSetBuilder(df_TRAIN, df_eval, df_ner_unique)
TRAIN_DATA = dataset.import_dataset_doccano('./commentaries/data_parsed/doccano_data/from_doccano_hdn1.json')
commentaries_convivio_eva = dataset.clean_commentaries_eva
commentaries_monarchia = dataset.clean_commentaries
@@ -163,9 +166,9 @@ def main():
nlp = spacy.load('./model_fastText/model_spacy_latest')
seed = random.randint(0, len(commentaries_convivio_eva) - 1)  # randint is inclusive on both ends
preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval)
preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia.json', extension=df_eval)
aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
# Testing ------------------------
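
For context, the preds consumed by kb.link_entities are (text, label) pairs produced by the spaCy
model; extracting them is a one-liner over doc.ents (the sample text and the expected output shown
in the comment are illustrative):

import spacy

nlp = spacy.load('./model_fastText/model_spacy_latest')  # model path as used in main() above
doc = nlp('Come dice Aristotele nel primo de la Fisica ...')  # sample commentary text
preds = [(ent.text, ent.label_) for ent in doc.ents]
# e.g., [('Aristotele', 'PER'), ('Fisica', 'WORK_OF_ART')]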