import pickle

import numpy as np
import spacy
from spacy.kb import KnowledgeBase
from tqdm import tqdm

# Load the entity dictionary (entity name -> {'abstract': ..., 'names': ...}).
#with open('./KB_abs_reversed.pickle', 'rb') as infile:
with open('knowledge_base/KB_abs_merged.pickle', 'rb') as infile:
    entities_dict = pickle.load(infile)

print(f'Number of entities in original knowledge base: {len(entities_dict)}')
#print(entities_dict.keys())


def load_word_vectors(model, path_to_vec, max_vec=100000):
    """Load up to max_vec fastText vectors from a .vec file into the model's vocab."""
    with open(path_to_vec, 'r') as infile:
        header = infile.readline()
        n_row, n_dim = header.split()  # n_row is unused; only the dimensionality is needed
        model.vocab.reset_vectors(width=int(n_dim))
        count = 0
        for line in tqdm(infile, total=max_vec):
            count += 1
            line = line.rstrip()
            pieces = line.rsplit(' ', int(n_dim))
            word = pieces[0]
            #print("{} - {}".format(count, word))
            vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
            model.vocab.set_vector(word, vector)
            if count == max_vec:
                break
    return model


def generate_IDs(entities_dict_keys):
    """
    Map each entity to a synthetic ID ('Q1', 'Q2', ...).

    Entity dictionary keys are English-spelled names (when the entity is
    present in DBpedia); otherwise the key is the Italian spelling.
    """
    IDs_2_ent = {'Q' + str(i + 1): name for i, name in enumerate(entities_dict_keys)}
    IDs_2_abs = {}
    for ID, ent in IDs_2_ent.items():
        IDs_2_abs[ID] = entities_dict[ent]['abstract']
    return IDs_2_ent, IDs_2_abs


IDs_2_ent, IDs_2_abs = generate_IDs(entities_dict.keys())
reverse_id = {v: k for k, v in IDs_2_ent.items()}
#print(IDs_2_ent)
#print(entities_dict['Nicholas_of_Lyra']['names'])
#print(reverse_id)

# Load the base pipeline and attach the Italian fastText vectors.
nlp = spacy.load('../model_fastText/')
nlp = load_word_vectors(nlp, '../embeddings/cc.it.300.vec', 50000)

kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

# Encode each entity's abstract and register it in the knowledge base.
for qid in IDs_2_ent.keys():
    desc = nlp(IDs_2_abs[qid])
    desc_enc = desc.vector
    kb.add_entity(entity=qid, entity_vector=desc_enc, freq=314)  # placeholder frequency

# Register the canonical name plus any alternative-language names as aliases.
for qid, name in IDs_2_ent.items():
    kb.add_alias(alias=name, entities=[qid], probabilities=[1])
    other_lang_names = entities_dict[name]['names']
    if other_lang_names != 'NA':
        for other_name in set(other_lang_names):
            if other_name != name:
                kb.add_alias(alias=other_name, entities=[qid], probabilities=[1])

#kb.dump('./kb_fastText_2')

"""
nlp = spacy.load('../model_fastText/')
nlp = load_word_vectors(nlp, '../embeddings/cc.it.300.vec', 50000)
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)
"""

# Italian-language abstract (describing Sallust) used by the commented-out single-entity test below.
desc = """Proveniente da una famiglia plebea legata alla nobilitas municipale, compì a Roma il cursus honorum, divenendo prima questore, poi tribuno della plebe ed infine senatore della res publica. Dopo esser stato cacciato dal Senato per indegnità morale, partecipò alla guerra civile del 49 a.C. tra Cesare e Pompeo, schierato tra le file cesariane. Dopo la sconfitta di Pompeo, Cesare lo ricompensò per la sua fedeltà conferendogli la pretura, riammettendolo in Senato e nominandolo governatore della provincia dell'Africa Nova"""

"""
kb.add_entity(entity='Q1', entity_vector=nlp(desc).vector, freq=314)
kb.add_alias(alias='Salustio', entities=['Q1'], probabilities=[1])
kb.add_alias(alias='Sallustio', entities=['Q1'], probabilities=[1])
"""

print(f'Candidates for "Tommaso": {[c.entity_ for c in kb.get_candidates("Tommaso")]}')
print(f'Candidates for "Thomas_Aquinas": {[c.entity_ for c in kb.get_candidates("Thomas_Aquinas")]}')
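
# --- Hedged sketch: persisting and reloading the knowledge base -------------
# Not part of the original pipeline. This is a minimal sketch of how the KB
# built above could be saved and reloaded with the spaCy 2.x KnowledgeBase API
# (kb.dump / kb.load_bulk), mirroring the commented-out kb.dump call above.
# The output paths 'output/nlp_fastText' and 'output/kb_fastText' are
# assumptions, not paths used elsewhere in this script.
import os

os.makedirs('output', exist_ok=True)

# The vocab must be saved alongside the KB: load_bulk() only restores entities
# and aliases, and expects the same vocab the KB was created with.
nlp.to_disk('output/nlp_fastText')
kb.dump('output/kb_fastText')

# Reload into a fresh KnowledgeBase and spot-check that the aliases survived.
nlp_reloaded = spacy.load('output/nlp_fastText')
kb_reloaded = KnowledgeBase(vocab=nlp_reloaded.vocab, entity_vector_length=300)
kb_reloaded.load_bulk('output/kb_fastText')
print(f'Reloaded KB candidates for "Tommaso": '
      f'{[c.entity_ for c in kb_reloaded.get_candidates("Tommaso")]}')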