95 lines
3.5 KiB
Python
95 lines
3.5 KiB
Python
import spacy
|
|
from spacy.kb import KnowledgeBase
|
|
import pickle
|
|
from pprint import pprint
|
|
import numpy as np
|
|
from tqdm import tqdm
|
|
|
|
# Load the entity dictionary that the knowledge base is built from.
# (Alternative input that was used previously: './KB_abs_reversed.pickle'.)
with open('knowledge_base/KB_abs_merged.pickle', mode='rb') as kb_file:
    entities_dict = pickle.load(kb_file)

# Report how many entities were read before any further processing.
print(f'Number of entities in original knowledge Base: {len(entities_dict)}')
|
|
#print(entities_dict.keys())
|
|
|
|
|
|
def load_word_vectors(model, path_to_vec, max_vec=100000):
    """Replace the vectors of ``model.vocab`` with vectors read from a text file.

    The file is expected to start with a header line ``"<n_rows> <n_dim>"``
    followed by one line per word: the word itself and then ``n_dim``
    space-separated float components.

    Args:
        model: Object exposing ``vocab.reset_vectors(width=...)`` and
            ``vocab.set_vector(word, vector)`` (here: a loaded spaCy pipeline).
        path_to_vec: Path of the vector file to read.
        max_vec: Maximum number of vectors to load. ``None`` (or any value
            below 1) loads the whole file.

    Returns:
        The same ``model`` object, with its vocabulary vectors replaced.

    Raises:
        ValueError: If the header does not consist of two integers, or a
            vector component cannot be parsed as a float.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, and the
    # words in the file are not restricted to ASCII.
    with open(path_to_vec, 'r', encoding='utf-8') as infile:
        n_row, n_dim = (int(field) for field in infile.readline().split())
        model.vocab.reset_vectors(width=n_dim)

        limit = max_vec if max_vec is not None and max_vec > 0 else None
        # Never announce more rows to the progress bar than the file declares.
        total = n_row if limit is None else min(n_row, limit)

        loaded = 0
        for line in tqdm(infile, total=total):
            line = line.rstrip()
            if not line:
                # A blank line would otherwise register an empty word with an
                # empty vector.
                continue
            # Split from the right so that a word containing spaces stays
            # intact in pieces[0].
            pieces = line.rsplit(' ', n_dim)
            word = pieces[0]
            vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
            model.vocab.set_vector(word, vector)
            loaded += 1
            if loaded == limit:
                break

    return model
|
|
|
|
|
|
def generate_IDs(entities_dict_keys, entities=None):
    """Assign a synthetic ID to every entity name and collect its abstract.

    Entity names are the English-spelled names when the entity is present in
    DBpedia; otherwise the Italian spelling is used.

    Args:
        entities_dict_keys: Iterable of entity names. IDs ``'Q1'``, ``'Q2'``,
            ... are assigned in iteration order.
        entities: Mapping from entity name to a record that has an
            ``'abstract'`` key. Defaults to the module-level ``entities_dict``
            so existing one-argument calls keep working.

    Returns:
        A tuple ``(IDs_2_ent, IDs_2_abs)``: ID -> entity name and
        ID -> abstract, both in the same order.

    Raises:
        KeyError: If a name is missing from ``entities`` or its record has no
            ``'abstract'`` entry.
    """
    if entities is None:
        entities = entities_dict

    IDs_2_ent = {
        f'Q{number}': name
        for number, name in enumerate(entities_dict_keys, start=1)
    }
    IDs_2_abs = {
        ID: entities[ent]['abstract'] for ID, ent in IDs_2_ent.items()
    }
    return IDs_2_ent, IDs_2_abs
|
|
|
|
# Give every entity a synthetic 'Q<n>' identifier and keep the inverse lookup.
IDs_2_ent, IDs_2_abs = generate_IDs(entities_dict.keys())
reverse_id = {v: k for k, v in IDs_2_ent.items()}

# Load the pipeline and overwrite its vectors with the first 50000 rows of the
# Italian fastText vector file.
nlp = spacy.load('../model_fastText/')
nlp = load_word_vectors(nlp, '../embeddings/cc.it.300.vec', 50000)

# 300 matches the width of the cc.it.300 vectors loaded above.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

# Each entity is encoded as the document vector of its abstract.
# NOTE(review): freq=314 is the same constant for every entity -- presumably a
# placeholder because no real frequency counts are available; confirm.
for qid, abstract in IDs_2_abs.items():
    kb.add_entity(entity=qid, entity_vector=nlp(abstract).vector, freq=314)

# Collect every alias (the entity's own name plus its names in other
# languages) together with ALL entities that carry it. KnowledgeBase.add_alias
# ignores an alias that is already registered, so adding the same alias once
# per entity would keep only the first entity as a candidate.
alias_2_qids = {}
for qid, name in IDs_2_ent.items():
    aliases = {name}
    other_lang_names = entities_dict[name]['names']
    if other_lang_names != 'NA':  # 'NA' marks "no alternative names"
        aliases.update(other_lang_names)
    for alias in aliases:
        alias_2_qids.setdefault(alias, []).append(qid)

# Register each alias exactly once. Without better information the prior is
# split uniformly between the entities sharing the alias (a unique alias still
# gets probability 1, as before).
for alias, qids in alias_2_qids.items():
    kb.add_alias(
        alias=alias,
        entities=qids,
        probabilities=[1 / len(qids)] * len(qids),
    )

# Persisting the knowledge base is currently disabled.
# kb.dump('./kb_fastText_2')

# Sample entity description (Sallust); not used by the code below.
desc = """Proveniente da una famiglia plebea legata alla nobilitas municipale, compì a Roma il cursus honorum, divenendo prima questore, poi tribuno della plebe ed infine
senatore della res publica. Dopo esser stato cacciato dal Senato per indegnità morale, partecipò alla guerra civile del 49 a.C. tra Cesare e Pompeo, schierato tra
le file cesariane. Dopo la sconfitta di Pompeo, Cesare lo ricompensò per la sua fedeltà conferendogli la pretura, riammettendolo in Senato e nominandolo governatore
della provincia dell'Africa Nova"""

# Smoke test: show the candidate entities for a couple of mentions. The label
# is derived from the queried mention so the two can never disagree.
for mention in ('Tommaso', 'Thomas_Aquinas'):
    candidates = [c.entity_ for c in kb.get_candidates(mention)]
    print(f'Cand for "{mention}": {candidates}')