- Build KB via Wikidata

- Add a second level of fuzzy matching
andrea 2020-10-22 20:39:24 +02:00
parent d95e596ee8
commit fb84b36b90
9 changed files with 78 additions and 82 deletions

.gitignore (vendored): 1 line changed
View File

@@ -8,6 +8,7 @@
 /commentaries/*.xml
 /commentaries/*.xsd
 /commentaries/*.zip
+/entity_linker/knowledge_base/*.pickle
 # User-specific stuff
 .idea/**/workspace.xml

View File

@@ -39,18 +39,23 @@ def testing_wikidata(entity_q):
     entity = client.get(entity_q, load=True)
     notable_work = client.get('P800')
     present_in_work = client.get('P1441')
-    # date_of_birth = client.get('P569')
-    # birth = entity.get(date_of_birth) # TODO: debug this
+    date_of_birth = client.get('P569')
     aut_names = entity.label.texts
     _works = entity.get(notable_work)
     _present_in_work = entity.get(present_in_work)
+    _birth = entity.get(date_of_birth)
     if _works is not None:
         for work in _works:
             dict_works[work.id] = work.label.texts
     if _present_in_work is not None:
         for p_work in _present_in_work:
             dict_present_in_works[p_work.id] = p_work.label.texts
-    return entity, aut_names, dict_works, dict_present_in_works
+    if _birth is not None:
+        _birth = _birth[0]
+    return entity, aut_names, dict_works, dict_present_in_works, _birth
 
 
 def print_results(results):
@@ -73,7 +78,7 @@ def extract_wikidata_endpoint(author_names, show_warnings=True):
         return endpoint
     except IndexError:
         if show_warnings:
-            warnings.warn('Entity has not a wikimdata endpoint ')
+            warnings.warn('Entity has not a wikidata endpoint ')
         return None
@@ -94,12 +99,13 @@ for auth in tqdm.tqdm(full_auth_list):
     wikidata_endp = extract_wikidata_endpoint(entity_q, show_warnings=False)
     dict_res[wikidata_endp] = None
     if wikidata_endp is not None:
-        _, names, works, other_works = testing_wikidata(wikidata_endp)
+        _, names, works, other_works, y_birth = testing_wikidata(wikidata_endp)
         dict_res[wikidata_endp] = {'aut_name': names,
                                    'aut_works': works,
-                                   'aut_present_work': other_works}
+                                   'aut_present_work': other_works,
+                                   'birth': y_birth}
 
-with open('knowledge_base/KB_wikimedia.json', 'w+') as f:
+with open('knowledge_base/KB_wikimedia_with_dates.json', 'w+') as f:
     json.dump(dict_res, f)
 print(f'# Process finished in: {round((time.time()-stime), 5)}')
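
The hunks above extend testing_wikidata() so that, for each author QID, the script now also pulls the date of birth (P569) next to the multilingual labels, the notable works (P800) and the present-in-work items (P1441), and the loop dumps everything to KB_wikimedia_with_dates.json. Below is a minimal per-entity sketch of that step; it assumes the script's client comes from the `wikidata` package (the Client import itself is outside the hunks shown), and 'Q9438' is only a placeholder QID standing in for whatever extract_wikidata_endpoint() returns.

```python
from wikidata.client import Client

client = Client()
entity = client.get('Q9438', load=True)    # placeholder QID, not taken from the commit
notable_work = client.get('P800')          # "notable work"
present_in_work = client.get('P1441')      # "present in work"
date_of_birth = client.get('P569')         # "date of birth"

record = {'aut_name': entity.label.texts,  # multilingual labels keyed by language code
          'aut_works': {},
          'aut_present_work': {},
          'birth': None}

_works = entity.get(notable_work)
if _works is not None:
    for work in _works:
        record['aut_works'][work.id] = work.label.texts

_present = entity.get(present_in_work)
if _present is not None:
    for p_work in _present:
        record['aut_present_work'][p_work.id] = p_work.label.texts

_birth = entity.get(date_of_birth)
if _birth is not None:
    record['birth'] = _birth[0]            # keep only the first date statement, as above
```

Keeping only the first P569 value mirrors the `_birth = _birth[0]` line in the hunk; any additional birth-date statements are simply ignored.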

View File

@@ -30,7 +30,7 @@ with open('./KB_abs_merged.pickle', 'wb') as infile:
 from pprint import pprint
 pprint(merged['Giles_of_Rome'])
 """
-with open('./KB_abs_merged.pickle', 'rb') as infile:
+with open('knowledge_base/KB_abs_merged.pickle', 'rb') as infile:
     kb = pickle.load(infile)
 reversed_dict = {}
@@ -45,5 +45,5 @@ for key in kb.keys():
 print(len(reversed_dict))
-with open('./KB_abs_reversed.pickle', 'wb') as outfile:
+with open('knowledge_base/KB_abs_reversed.pickle', 'wb') as outfile:
     pickle.dump(reversed_dict, outfile)

Binary file not shown.

Binary file not shown.

View File

@@ -1,82 +1,62 @@
-"""
-Should also evaluate IF and HOW actual spaCy KB could be deoloyed in this scenario
-https://github.com/seatgeek/fuzzywuzzy?source=post_page---------------------------
-"""
 # TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
 from difflib import SequenceMatcher
-from pprint import pprint
-import pickle
+import json
 
 
-class Knowledge_base:
+class KnowledgeBase:
     def __init__(self, kb_path):
         with open(kb_path, 'rb') as infile:
-            data = pickle.load(infile)
-        self.kb = data
-        #self.utt2ent = self._generate_utter_2_ent()
+            data = json.load(infile)
+        self.id2aut = data
+        self.aut2id = {}
+        self._popolate_aut2id()
 
-    def link_entities(self, preds):
-        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER']
+    def link_entities(self, preds, deepfuzz=False):
+        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
         WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
+        print('-'*50)
         print(f'Candidate authors (i.e., entitites matched): {PER_preds}')
         # print(f'Candidates work:\n{WORK_preds}')
         COMMEDIA_DATE = 1321
-        """
-        for target in set(PER_preds):
-            if target in self.utt2ent.keys():
-                print(target, self.utt2ent[target])
-        """
-        print('#'*50 + '\nChecking in KB...')
-        # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'aquino)
+        print('-'*50 + '\nChecking in KB...')
+        # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'Aquino)
         for target in set(PER_preds):
             scores = []
-            for auth in self.kb.keys():
+            deepscore = []
+            for auth in self.aut2id.keys():
                 sim = self._similar(target, auth)
                 scores.append((auth, sim))
             scores.sort(key=lambda tup: tup[1], reverse=True)
+            success = False
             for i in range(3):
                 if scores[i][1] > .8:
-                    print(f'Prediction: {target} - {scores[i]} - born in {self.kb[scores[i][0]]["birth"]}')
-                    break
-            #elif scores[0][1] == 0:
-            # print(f'Author {target} not in KB ')
+                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"], scores[i][1]} - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
+                    success = True
+                    break
+            if deepfuzz and not success:
+                for aut in self.aut2id.keys():
+                    _splitname = aut.split(' ')
+                    sim = 0
+                    for split in _splitname:
+                        _sim = self._similar(target, split)
+                        if _sim > sim:
+                            sim = _sim
+                    deepscore.append((aut, sim))
+                deepscore.sort(key=lambda tup: tup[1], reverse=True)
+                for j in range(3):
+                    if deepscore[j][1] > .8:
+                        print(
+                            f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"], deepscore[j][1]} - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
+                        break
         return 0
-        """
-        for target in set(PER_preds):
-            #print(f'TARGET: {target}')
-            scores = []
-            for auth in self.kb.keys():
-                sim = self._similar(target, auth)
-                scores.append((auth, sim))
-            scores.sort(key=lambda tup: tup[1], reverse=True)
-            # pprint(scores[:3])
-            all_lang_scores = self._check_other_lang(scores[0], target)
-            if all_lang_scores[0][1] >= 0.8: # with this threshold 'Tommaso' is not linked to 'Tommaso d'aquino' ...
-                print(f'TARGET: {target}')
-                print(f'{all_lang_scores[0][0]} was born in year: {self.kb[scores[0][0]]["birth"]}')
-                #print(all_lang_scores)
-            else:
-                continue
-                #print('Author not in KB')
-            print('-'*15)
-        """
 
     def _generate_utter_2_ent(self):
         utt_2_ent = {}
         for ent_en in self.kb.keys():
@@ -84,7 +64,6 @@ class Knowledge_base:
                 utt_2_ent[utt] = ent_en
         return utt_2_ent
 
     def _check_other_lang(self, target, original_name):
         other_names = self.kb[target[0]]['names']
@@ -97,3 +76,12 @@ class Knowledge_base:
     def _similar(self,a, b):
         return SequenceMatcher(None, a, b).ratio()
+
+    def _popolate_aut2id(self):
+        for qid, values in self.id2aut.items():
+            if values is not None:
+                l_names = set(values['aut_name'].values())
+                for name in l_names:
+                    self.aut2id[name] = qid
+        return self
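
The rewritten linker above matches in two passes: first the full predicted mention is compared against every alias in aut2id with difflib's SequenceMatcher and accepted if one of the top three candidates scores above 0.8; if none does and deepfuzz is enabled, a second pass compares the mention against the space-separated tokens of each alias, so a bare first name can still reach a multi-word alias. Here is a standalone sketch of that two-pass idea (the alias strings are made up for illustration, and only the best candidate is checked, not the top three as in the class above).

```python
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

def best_match(target, aliases, threshold=0.8):
    # First pass: the whole mention against each full alias string.
    scores = sorted(((alias, similar(target, alias)) for alias in aliases),
                    key=lambda t: t[1], reverse=True)
    if scores and scores[0][1] > threshold:
        return scores[0]
    # Second pass ("deepfuzz"): best similarity against any single token of each alias.
    deep = sorted(((alias, max(similar(target, tok) for tok in alias.split(' ')))
                   for alias in aliases),
                  key=lambda t: t[1], reverse=True)
    if deep and deep[0][1] > threshold:
        return deep[0]
    return None

aliases = ["Tommaso d'Aquino", 'Aristotele', 'Virgilio']
print(best_match('Tommaso', aliases))   # found only by the second, token-level pass
```

The example shows why the second level exists: 'Tommaso' scores about 0.61 against the full alias "Tommaso d'Aquino", below the 0.8 threshold, but hits 1.0 against its first token.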

View File

@@ -6,12 +6,13 @@ import numpy as np
 from tqdm import tqdm
 
 #with open('./KB_abs_reversed.pickle', 'rb') as infile:
-with open('./KB_abs_merged.pickle', 'rb') as infile:
+with open('knowledge_base/KB_abs_merged.pickle', 'rb') as infile:
     entities_dict = pickle.load(infile)
 print(f'Number of entities in original knowledge Base: {len(entities_dict)}')
 #print(entities_dict.keys())
 
 def load_word_vectors(model, path_to_vec, max_vec=100000):
     with open(path_to_vec, 'r') as infile:
         header = infile.readline()
@@ -31,6 +32,7 @@ def load_word_vectors(model, path_to_vec, max_vec=100000):
     return model
 
 def generate_IDs(entities_dict_keys):
     """
     Entities dictionary keys are english spelled names (if such an entities is

main.py: 24 lines changed
View File

@@ -4,7 +4,7 @@ import numpy as np
 from spacy.util import minibatch, compounding
 import warnings
 from preprocessing.ner_dataset_builder import DataSetBuilder
-from entity_linker.knowledge_base import Knowledge_base
+from entity_linker.knowledge_base import KnowledgeBase
 from tqdm import tqdm
 from pathlib import Path
 import pickle
@@ -18,11 +18,10 @@ df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
 df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')
 
-def train_model(model, TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
+def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
     model = spacy.load(SPACY_MODEL_STD)
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
-    TRAIN_DATA = TRAIN_DATA
     ner = model.get_pipe('ner')
@@ -64,7 +63,6 @@ def train_model(model, TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPAC
             i += 100
             print(comment[i:len(comment)])
     disabled.restore()
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@@ -100,7 +98,7 @@ def train_model(model, TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPAC
         print(gold)
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
-    save_model(model, 'it_dante', output_dir)
+    save_model(model, 'it_dante_new', output_dir)
     return model
@@ -130,11 +128,11 @@ def predict_candidates(model, comment, labels=None):
     if labels is not None:
         query = comment
         gold = labels[labels['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
-        print(f'{len(gold)} GOLD TARGETS ' + '#'*50)
+        print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
         for i in range(len(gold)):
             elem = gold.iloc[i]
             print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}')
-            print('\n')
+            # print('\n')
     return candidates, gold
@@ -173,28 +171,28 @@ def load_word_vectors(model, path_to_vec, max_vec=100000):
 def main():
     df_TRAIN = df_monarchia
     df_eval = df_convivio
-    # df_eval = df_monarchia
     dataset = DataSetBuilder(df_TRAIN, df_eval, df_ner_unique)
     TRAIN_DATA = dataset.import_dataset_doccano('./commentaries/data_parsed/doccano_data/from_doccano_hdn1.json')
     commentaries_convivio_eva = dataset.clean_commentaries_eva
     commentaries_monarchia = dataset.clean_commentaries
     raw_commentaries_convivio = dataset.commentaries_eva
+    # train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest')
     # nlp = spacy.load('it_core_news_sm')
     # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
-    #nlp = train_model(nlp, TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/')
     # dataset_convivio = DataSetBuilder(df_eval, df_eval)
     # dataset_convivio.export_dataset_doccano('std_convivio')
-    nlp = spacy.load('./model_fastText/')
+    nlp = spacy.load('./model_fastText/model_spacy_latest')
     # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
     # print(len(list(nlp.vocab.strings))) # get whole model vocabulary
     seed = random.randint(1, len(commentaries_convivio_eva))
     preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
-    kb = Knowledge_base('./entity_linker/KB_abs_reversed.pickle')
-    kb.link_entities(preds)
+    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json')
+    kb.link_entities(preds, deepfuzz=True)
+    print(f'\nComment Numbert: {seed}')
     exit()
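
For the linking call at the end of main(), a hypothetical minimal invocation; it assumes preds is the list of (text, label) pairs that predict_candidates() returns as candidates, which is the shape link_entities() indexes into in the class above.

```python
from entity_linker.knowledge_base import KnowledgeBase

# Hypothetical predictions in the (text, label) shape link_entities() expects.
preds = [('Virgilio', 'PER'), ('Eneide', 'WORK_OF_ART')]
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json')
kb.link_entities(preds, deepfuzz=True)   # falls back to token-level matching when needed
```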

View File

@@ -14,6 +14,7 @@ COMMENTARIES_PATH = './commentaries/'
 DF_COMMENTARIES_PATH = './commentaries/data_parsed/'
 df_commentary_monarchia = pd.read_csv(DF_COMMENTARIES_PATH + 'monarchia_DF.csv')
 df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')
 """
 df_ner_unique ATM contains <i>terms</i> found in "De Monarchia". The .csv file should
 contain all the occurrences of tagged terms across all of the (tagged) documents!