minor fixes and added some comments

commit c084babebd (parent 6d65bfb03d)
andrea, 2020-10-28 09:52:45 +01:00
11 changed files with 34 additions and 76 deletions


@@ -1,4 +0,0 @@
-This is Bonifacio II (1150-1207), who was one of the leaders of the Fourth Crusade, which ended with
-the capture of Constantinople (1204) and the creation of the Latin Empire of the East (Bonifacio
-received the kingdom of Thessalonica). The Provençal troubadours of his time exalted the generous
-hospitality of his court (Rambaut de Vaqueiras praises him by comparing him to Alexander the Great himself).


@@ -7,7 +7,7 @@ import json
 import time

-def testing_dbpedia(author):
+def query_dbpedia(author):
     endpoint = 'http://dbpedia.org/sparql'
     s = sparql.Service(endpoint, "utf-8", "SELECT")
     query_author = """SELECT ?names WHERE {{
@@ -23,7 +23,7 @@ def testing_dbpedia(author):
     return [result, results_works]

-def testing_wikidata(entity_q):
+def query_wikidata(entity_q):
     """
     Notable work = P800
     Date of birth = P569
@@ -95,11 +95,11 @@ full_auth_list = author_uri_list_convivio + author_uri_list_monarchia + author_u
 dict_res = {}
 print(f'# Number of authors: {len(full_auth_list)}')
 for auth in tqdm.tqdm(full_auth_list):
-    entity_q = testing_dbpedia(auth)[0]
+    entity_q = query_dbpedia(auth)[0]
     wikidata_endp = extract_wikidata_endpoint(entity_q, show_warnings=False)
     dict_res[wikidata_endp] = None
     if wikidata_endp is not None:
-        _, names, works, other_works, y_birth = testing_wikidata(wikidata_endp)
+        _, names, works, other_works, y_birth = query_wikidata(wikidata_endp)
         dict_res[wikidata_endp] = {'aut_name': names,
                                    'aut_works': works,
                                    'aut_present_work': other_works,
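
The two renamed helpers are thin wrappers around SPARQL lookups. As a point of reference, here is a minimal self-contained sketch of the same flow written against SPARQLWrapper instead of the sparql-client module the script imports; the function names and SELECT clauses are illustrative assumptions, not the repository's code:

from SPARQLWrapper import SPARQLWrapper, JSON

def query_dbpedia_labels(author_uri):
    # Hypothetical stand-in for query_dbpedia: fetch the rdfs:label values
    # attached to an author resource on the public DBpedia endpoint.
    endpoint = SPARQLWrapper('http://dbpedia.org/sparql')
    endpoint.setQuery(f'SELECT ?names WHERE {{ <{author_uri}> rdfs:label ?names . }}')
    endpoint.setReturnFormat(JSON)
    rows = endpoint.query().convert()['results']['bindings']
    return [row['names']['value'] for row in rows]

def query_wikidata_facts(entity_q):
    # Hypothetical stand-in for query_wikidata; P800 = notable work and
    # P569 = date of birth, the properties named in the docstring above.
    endpoint = SPARQLWrapper('https://query.wikidata.org/sparql')
    endpoint.setQuery(f"""SELECT ?work ?birth WHERE {{
        OPTIONAL {{ wd:{entity_q} wdt:P800 ?work . }}
        OPTIONAL {{ wd:{entity_q} wdt:P569 ?birth . }}
    }}""")
    endpoint.setReturnFormat(JSON)
    return endpoint.query().convert()['results']['bindings']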


@@ -1,49 +0,0 @@
-import pickle
-from pprint import pprint
-
-"""
-with open('./KB_abs_convivio.pickle', 'rb') as infile:
-    kb1 = pickle.load(infile)
-print(len(kb1))
-
-with open('./KB_abs_monarchia.pickle', 'rb') as infile:
-    kb2 = pickle.load(infile)
-print(len(kb2))
-
-def merge_dicts(iter_dict):
-    merged = {}
-    for i, dict in enumerate(iter_dict):
-        if i == 0:
-            merged = iter_dict[i]
-            continue
-        else:
-            for k, v in iter_dict[i].items():
-                if k not in merged.keys():
-                    merged[k] = v
-    return merged
-
-merged = merge_dicts([kb1, kb2])
-
-with open('./KB_abs_merged.pickle', 'wb') as infile:
-    pickle.dump(merged, infile)
-
-from pprint import pprint
-pprint(merged['Giles_of_Rome'])
-"""
-
-with open('knowledge_base/KB_abs_merged.pickle', 'rb') as infile:
-    kb = pickle.load(infile)
-
-reversed_dict = {}
-for key in kb.keys():
-    name_list = kb[key]['names']
-    to_add = {'name': 'None', 'birth': 'None', 'abstract': 'None'}
-    for name in name_list:
-        to_add['name'] = key
-        to_add['birth'] = kb[key]['birth']
-        to_add['abstract'] = kb[key]['abstract']
-        reversed_dict[name] = to_add
-
-print(len(reversed_dict))
-with open('knowledge_base/KB_abs_reversed.pickle', 'wb') as outfile:
-    pickle.dump(reversed_dict, outfile)


@@ -18,15 +18,22 @@ class KnowledgeBase:
         self._popolate_aut2id()

     def link_entities(self, preds, deepfuzz=False):
-        COMMEDIA_DATE = 1321
         PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
         WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
-        # print('-'*50)
-        # print(f'Candidate authors (i.e., entities matched): {PER_preds}')
-        # print(f'Candidates work :{WORK_preds}')
-        print('-'*50 + '\n\nFINAL OUTPUT:\n### Author matches:')
+        print('\nMODEL RAW PREDICTIONS:')
+        print(f'Candidate authors (i.e., entities matched): {PER_preds}')
+        print(f'Candidates work :{WORK_preds}')
+        COMMEDIA_DATE = 1321
+        print('-'*50 + '\n\nOUTPUT:\n### Author matches:')
+        """
+        Sort PER_preds (i.e., entities of type PERSON matched by the NER model in the given commentary)
+        by the edit distance computed against all of the entity names in the KB. First, the similarity
+        measure is computed between the NER match and the exact name stored in the KB. Then, if
+        deepfuzz == True, the same metric is also computed between the original NER match and every
+        author name in the KB split on ' ' (i.e., space), in order to deal with name abbreviations
+        ('Tommaso d'Aquino' is often referred to simply as 'Tommaso' in the commentaries).
+        Once sorted, the first element satisfying the given threshold is returned.
+        """
         aut_res = []
         for target in set(PER_preds):
             scores = []
@@ -40,7 +47,6 @@ class KnowledgeBase:
             for i in range(3):
                 if scores[i][1] > .8:
                     print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}')
-                    #, scores[i][1]}')  # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
                     success = True
                     aut_res.append(target)
                     break
@@ -55,13 +61,18 @@ class KnowledgeBase:
                     deepscore.append((aut, sim))
                 deepscore.sort(key=lambda tup: tup[1], reverse=True)
                 for j in range(3):
-                    if deepscore[j][1] > .9:
+                    if deepscore[j][1] > .8:
                         print(
                             f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}')
-                        #, deepscore[j][1]}')  # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
                         aut_res.append(target)
                         break
+        """
+        Sort WORK_preds (i.e., entities of type WORK_OF_ART matched by the NER model in the given
+        commentary) by the edit distance computed against all of the work titles in the KB. The
+        similarity measure is computed between the NER match and the exact title stored in the KB.
+        Once sorted, the first element satisfying the given threshold is returned.
+        """
         work_res = {}
         if len(WORK_preds) != 0:
             print('-' * 50 + '\n### Works matches:')
@@ -72,7 +83,7 @@ class KnowledgeBase:
                 scores_work.append((work, sim))
             scores_work.sort(key=lambda tup: tup[1], reverse=True)
             for i in range(3):
-                if scores_work[i][1] > .75:
+                if scores_work[i][1] > .7:
                     print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}')
                     work_res[target] = self.works2aut[scores_work[i][0]]
                     break
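
Both branches follow the pattern the new docstrings describe: score every KB entry against the NER match, sort, and take the first of the top three candidates that clears a threshold. A stand-alone sketch of that pattern, using stdlib difflib as a placeholder for whatever similarity function link_entities actually calls (the real one is not shown in this diff):

from difflib import SequenceMatcher

def best_fuzzy_match(target, candidates, threshold=0.8, deepfuzz=False):
    # Score the NER match against every candidate KB name; with deepfuzz,
    # also score each space-separated token so that 'Tommaso' can still
    # hit 'Tommaso d'Aquino', as the added docstring explains.
    scored = []
    for cand in candidates:
        variants = [cand] + (cand.split(' ') if deepfuzz else [])
        sim = max(SequenceMatcher(None, target, v).ratio() for v in variants)
        scored.append((cand, sim))
    scored.sort(key=lambda tup: tup[1], reverse=True)
    # As in link_entities, return the first top candidate that satisfies
    # the threshold, or nothing.
    for cand, sim in scored[:3]:
        if sim > threshold:
            return cand
    return None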

Four file diffs suppressed because one or more lines are too long.

main.py

@@ -14,8 +14,11 @@ COMMENTARIES_PATH = './commentaries/'
 DF_COMMENTARIES_PATH = './commentaries/data_parsed/'

 df_monarchia = pd.read_csv(DF_COMMENTARIES_PATH + 'monarchia_DF.csv')
 df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
+df_rime = pd.read_csv(DF_COMMENTARIES_PATH + 'rime_DF.csv')
 df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')
+# TODO --> rime.xml

 def pprint_com(comment, l=100):
     i = 0
@@ -103,7 +106,7 @@ def predict_candidates(model, comment, labels=None):
     if labels is not None:
         query = comment
         gold = labels[labels['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
-        print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
+        print(f'{len(gold)} GOLD TARGETS:')
         for i in range(len(gold)):
             elem = gold.iloc[i]
             print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')
@@ -147,13 +150,13 @@ def connect_aut_work(list_aut, list_work, kb):
     print('\n\nTODO')
     # qid_list = [kb.aut2id[author] for author in list_aut]
     # wid_list = [kb.works2aut[work] for work in list_work]
-    # print('lel')

 def main():
     df_TRAIN = df_monarchia
-    df_eval = df_convivio
+    df_eval = df_rime
     dataset = DataSetBuilder(df_TRAIN, df_eval, df_ner_unique)
     TRAIN_DATA = dataset.import_dataset_doccano('./commentaries/data_parsed/doccano_data/from_doccano_hdn1.json')
     commentaries_convivio_eva = dataset.clean_commentaries_eva
     commentaries_monarchia = dataset.clean_commentaries
@@ -163,9 +166,9 @@ def main():
     nlp = spacy.load('./model_fastText/model_spacy_latest')

     seed = random.randint(1, len(commentaries_convivio_eva))
-    preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
-    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval)
+    preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
+    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia.json', extension=df_eval)
     aut_res, work_res = kb.link_entities(preds, deepfuzz=True)

     # Testing ------------------------