diff --git a/.gitignore b/.gitignore
index 37dc9c4..949775a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@
 /commentaries/*.xml
 /commentaries/*.xsd
 /commentaries/*.zip
+/entity_linker/knowledge_base/*.pickle
 
 # User-specific stuff
 .idea/**/workspace.xml
diff --git a/entity_linker/KB_builder.py b/entity_linker/KB_builder.py
index 4836f9f..6518f2c 100644
--- a/entity_linker/KB_builder.py
+++ b/entity_linker/KB_builder.py
@@ -39,18 +39,23 @@ def testing_wikidata(entity_q):
     entity = client.get(entity_q, load=True)
     notable_work = client.get('P800')
     present_in_work = client.get('P1441')
-    # date_of_birth = client.get('P569')
-    # birth = entity.get(date_of_birth)  # TODO: debug this
+    date_of_birth = client.get('P569')
+
     aut_names = entity.label.texts
     _works = entity.get(notable_work)
     _present_in_work = entity.get(present_in_work)
+    _birth = entity.get(date_of_birth)
+
     if _works is not None:
         for work in _works:
             dict_works[work.id] = work.label.texts
     if _present_in_work is not None:
         for p_work in _present_in_work:
             dict_present_in_works[p_work.id] = p_work.label.texts
-    return entity, aut_names, dict_works, dict_present_in_works
+    if _birth is not None:
+        _birth = _birth[0]
+
+    return entity, aut_names, dict_works, dict_present_in_works, _birth
 
 
 def print_results(results):
@@ -73,7 +78,7 @@ def extract_wikidata_endpoint(author_names, show_warnings=True):
         return endpoint
     except IndexError:
         if show_warnings:
-            warnings.warn('Entity has not a wikimdata endpoint ')
+            warnings.warn('Entity has not a wikidata endpoint ')
         return None
 
 
@@ -94,12 +99,13 @@ for auth in tqdm.tqdm(full_auth_list):
     wikidata_endp = extract_wikidata_endpoint(entity_q, show_warnings=False)
     dict_res[wikidata_endp] = None
     if wikidata_endp is not None:
-        _, names, works, other_works = testing_wikidata(wikidata_endp)
+        _, names, works, other_works, y_birth = testing_wikidata(wikidata_endp)
         dict_res[wikidata_endp] = {'aut_name': names,
                                    'aut_works': works,
-                                   'aut_present_work': other_works}
+                                   'aut_present_work': other_works,
+                                   'birth': y_birth}
 
-with open('knowledge_base/KB_wikimedia.json', 'w+') as f:
+with open('knowledge_base/KB_wikimedia_with_dates.json', 'w+') as f:
     json.dump(dict_res, f)
 
 print(f'# Process finished in: {round((time.time()-stime), 5)}')
diff --git a/entity_linker/_merge_kbs.py b/entity_linker/_merge_kbs.py
index c7bf435..dd8f63f 100644
--- a/entity_linker/_merge_kbs.py
+++ b/entity_linker/_merge_kbs.py
@@ -30,7 +30,7 @@ with open('./KB_abs_merged.pickle', 'wb') as infile:
 from pprint import pprint
 pprint(merged['Giles_of_Rome'])
 """
-with open('./KB_abs_merged.pickle', 'rb') as infile:
+with open('knowledge_base/KB_abs_merged.pickle', 'rb') as infile:
     kb = pickle.load(infile)
 
 reversed_dict = {}
@@ -45,5 +45,5 @@ for key in kb.keys():
 
 print(len(reversed_dict))
 
-with open('./KB_abs_reversed.pickle', 'wb') as outfile:
+with open('knowledge_base/KB_abs_reversed.pickle', 'wb') as outfile:
     pickle.dump(reversed_dict, outfile)
\ No newline at end of file
diff --git a/entity_linker/kb_fastText b/entity_linker/kb_fastText
deleted file mode 100644
index ae91031..0000000
Binary files a/entity_linker/kb_fastText and /dev/null differ
diff --git a/entity_linker/kb_test b/entity_linker/kb_test
deleted file mode 100644
index 41eeff4..0000000
Binary files a/entity_linker/kb_test and /dev/null differ
diff --git a/entity_linker/knowledge_base.py b/entity_linker/knowledge_base.py
index 6ad6490..cd73afa 100644
--- a/entity_linker/knowledge_base.py
+++ b/entity_linker/knowledge_base.py
@@ -1,82 +1,62 @@
-"""
-Should also evaluate IF and HOW actual spaCy KB could be deoloyed in this scenario
-
-https://github.com/seatgeek/fuzzywuzzy?source=post_page---------------------------
-"""
 # TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
-
 from difflib import SequenceMatcher
-from pprint import pprint
-import pickle
+import json
 
 
-class Knowledge_base:
+class KnowledgeBase:
 
     def __init__(self, kb_path):
         with open(kb_path, 'rb') as infile:
-            data = pickle.load(infile)
-
-        self.kb = data
-        #self.utt2ent = self._generate_utter_2_ent()
+            data = json.load(infile)
+        self.id2aut = data
+        self.aut2id = {}
+        self._popolate_aut2id()
 
-    def link_entities(self, preds):
-        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER']
+    def link_entities(self, preds, deepfuzz=False):
+        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
         WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
 
+        print('-'*50)
         print(f'Candidate authors (i.e., entitites matched): {PER_preds}')
         # print(f'Candidates work:\n{WORK_preds}')
 
         COMMEDIA_DATE = 1321
+        print('-'*50 + '\nChecking in KB...')
 
-        """
-        for target in set(PER_preds):
-            if target in self.utt2ent.keys():
-                print(target, self.utt2ent[target])
-        """
-        print('#'*50 + '\nChecking in KB...')
-
-        # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'aquino)
+        # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'Aquino)
         for target in set(PER_preds):
             scores = []
-            for auth in self.kb.keys():
+            deepscore = []
+            for auth in self.aut2id.keys():
                 sim = self._similar(target, auth)
                 scores.append((auth, sim))
 
             scores.sort(key=lambda tup: tup[1], reverse=True)
+            success = False
             for i in range(3):
                 if scores[i][1] > .8:
-                    print(f'Prediction: {target} - {scores[i]} - born in {self.kb[scores[i][0]]["birth"]}')
+                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"], scores[i][1]} - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
+                    success = True
                     break
-                #elif scores[0][1] == 0:
-                #    print(f'Author {target} not in KB ')
+            if deepfuzz and not success:
+                for aut in self.aut2id.keys():
+                    _splitname = aut.split(' ')
+                    sim = 0
+                    for split in _splitname:
+                        _sim = self._similar(target, split)
+                        if _sim > sim:
+                            sim = _sim
+                    deepscore.append((aut, sim))
+                deepscore.sort(key=lambda tup: tup[1], reverse=True)
+                for j in range(3):
+                    if deepscore[j][1] > .8:
+                        print(
+                            f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"], deepscore[j][1]} - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
+                        break
 
         return 0
 
 
-        """
-        for target in set(PER_preds):
-            #print(f'TARGET: {target}')
-
-            scores = []
-            for auth in self.kb.keys():
-                sim = self._similar(target, auth)
-                scores.append((auth, sim))
-
-            scores.sort(key=lambda tup: tup[1], reverse=True)
-            # pprint(scores[:3])
-
-            all_lang_scores = self._check_other_lang(scores[0], target)
-
-            if all_lang_scores[0][1] >= 0.8:  # with this threshold 'Tommaso' is not linked to 'Tommaso d'aquino' ...
-                print(f'TARGET: {target}')
-                print(f'{all_lang_scores[0][0]} was born in year: {self.kb[scores[0][0]]["birth"]}')
-                #print(all_lang_scores)
-            else:
-                continue
-                #print('Author not in KB')
-            print('-'*15)
-
-        """
     def _generate_utter_2_ent(self):
         utt_2_ent = {}
         for ent_en in self.kb.keys():
@@ -84,7 +64,6 @@ class Knowledge_base:
                 utt_2_ent[utt] = ent_en
         return utt_2_ent
 
-
     def _check_other_lang(self, target, original_name):
 
         other_names = self.kb[target[0]]['names']
@@ -97,3 +76,12 @@ class Knowledge_base:
 
     def _similar(self,a, b):
         return SequenceMatcher(None, a, b).ratio()
+
+    def _popolate_aut2id(self):
+        for qid, values in self.id2aut.items():
+            if values is not None:
+                l_names = set(values['aut_name'].values())
+                for name in l_names:
+                    self.aut2id[name] = qid
+        return self
+
diff --git a/entity_linker/knowledge_base_spacy.py b/entity_linker/knowledge_base_spacy.py
index 1e599d9..96876c1 100644
--- a/entity_linker/knowledge_base_spacy.py
+++ b/entity_linker/knowledge_base_spacy.py
@@ -6,12 +6,13 @@ import numpy as np
 from tqdm import tqdm
 
 #with open('./KB_abs_reversed.pickle', 'rb') as infile:
-with open('./KB_abs_merged.pickle', 'rb') as infile:
+with open('knowledge_base/KB_abs_merged.pickle', 'rb') as infile:
     entities_dict = pickle.load(infile)
 
 print(f'Number of entities in original knowledge Base: {len(entities_dict)}')
 #print(entities_dict.keys())
 
+
 def load_word_vectors(model, path_to_vec, max_vec=100000):
     with open(path_to_vec, 'r') as infile:
         header = infile.readline()
@@ -31,6 +32,7 @@ def load_word_vectors(model, path_to_vec, max_vec=100000):
 
     return model
 
+
 def generate_IDs(entities_dict_keys):
     """
     Entities dictionary keys are english spelled names (if such an entities is
diff --git a/main.py b/main.py
index f6efee0..0a16215 100644
--- a/main.py
+++ b/main.py
@@ -4,7 +4,7 @@ import numpy as np
 from spacy.util import minibatch, compounding
 import warnings
 from preprocessing.ner_dataset_builder import DataSetBuilder
-from entity_linker.knowledge_base import Knowledge_base
+from entity_linker.knowledge_base import KnowledgeBase
 from tqdm import tqdm
 from pathlib import Path
 import pickle
@@ -18,11 +18,10 @@ df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
 df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')
 
 
-def train_model(model, TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
+def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
     model = spacy.load(SPACY_MODEL_STD)
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
 
-    TRAIN_DATA = TRAIN_DATA
 
     ner = model.get_pipe('ner')
 
@@ -64,7 +63,6 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPAC
             i += 100
         print(comment[i:len(comment)])
 
-
     disabled.restore()
 
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@@ -100,7 +98,7 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPAC
     print(gold)
 
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
-    save_model(model, 'it_dante', output_dir)
+    save_model(model, 'it_dante_new', output_dir)
 
     return model
 
@@ -130,11 +128,11 @@ def predict_candidates(model, comment, labels=None):
     if labels is not None:
         query = comment
         gold = labels[labels['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
-        print(f'{len(gold)} GOLD TARGETS ' + '#'*50)
+        print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
         for i in range(len(gold)):
             elem = gold.iloc[i]
             print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}')
{elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}') - print('\n') + # print('\n') return candidates, gold @@ -173,28 +171,28 @@ def load_word_vectors(model, path_to_vec, max_vec=100000): def main(): df_TRAIN = df_monarchia df_eval = df_convivio - # df_eval = df_monarchia dataset = DataSetBuilder(df_TRAIN, df_eval, df_ner_unique) TRAIN_DATA = dataset.import_dataset_doccano('./commentaries/data_parsed/doccano_data/from_doccano_hdn1.json') commentaries_convivio_eva = dataset.clean_commentaries_eva commentaries_monarchia = dataset.clean_commentaries raw_commentaries_convivio = dataset.commentaries_eva - #nlp = spacy.load('it_core_news_sm') - #nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000) - #nlp = train_model(nlp, TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/') - #dataset_convivio = DataSetBuilder(df_eval, df_eval) - #dataset_convivio.export_dataset_doccano('std_convivio') - nlp = spacy.load('./model_fastText/') - #nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000) - - #print(len(list(nlp.vocab.strings))) # get whole model vocabulary - + # train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest') + # nlp = spacy.load('it_core_news_sm') + # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000) + # dataset_convivio = DataSetBuilder(df_eval, df_eval) + # dataset_convivio.export_dataset_doccano('std_convivio') + + nlp = spacy.load('./model_fastText/model_spacy_latest') + # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000) + # print(len(list(nlp.vocab.strings))) # get whole model vocabulary + seed = random.randint(1, len(commentaries_convivio_eva)) preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval) - kb = Knowledge_base('./entity_linker/KB_abs_reversed.pickle') - kb.link_entities(preds) + kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json') + kb.link_entities(preds, deepfuzz=True) + print(f'\nComment Numbert: {seed}') exit() diff --git a/preprocessing/ner_dataset_builder.py b/preprocessing/ner_dataset_builder.py index 4eb4fb4..7b9309a 100644 --- a/preprocessing/ner_dataset_builder.py +++ b/preprocessing/ner_dataset_builder.py @@ -14,6 +14,7 @@ COMMENTARIES_PATH = './commentaries/' DF_COMMENTARIES_PATH = './commentaries/data_parsed/' df_commentary_monarchia = pd.read_csv(DF_COMMENTARIES_PATH + 'monarchia_DF.csv') df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv') + """ df_ner_unique ATM contains terms found in "De Monarchia". The .csv file should contain all the occurrences of tagged terms across all of the (tagged) documents!