parent d95e596ee8
commit fb84b36b90
@@ -8,6 +8,7 @@
 /commentaries/*.xml
 /commentaries/*.xsd
 /commentaries/*.zip
+/entity_linker/knowledge_base/*.pickle

 # User-specific stuff
 .idea/**/workspace.xml
@@ -39,18 +39,23 @@ def testing_wikidata(entity_q):
     entity = client.get(entity_q, load=True)
     notable_work = client.get('P800')
     present_in_work = client.get('P1441')
-    # date_of_birth = client.get('P569')
-    # birth = entity.get(date_of_birth) # TODO: debug this
+    date_of_birth = client.get('P569')
     aut_names = entity.label.texts
     _works = entity.get(notable_work)
     _present_in_work = entity.get(present_in_work)
+    _birth = entity.get(date_of_birth)

     if _works is not None:
         for work in _works:
             dict_works[work.id] = work.label.texts
     if _present_in_work is not None:
         for p_work in _present_in_work:
             dict_present_in_works[p_work.id] = p_work.label.texts
-    return entity, aut_names, dict_works, dict_present_in_works
+    if _birth is not None:
+        _birth = _birth[0]
+
+    return entity, aut_names, dict_works, dict_present_in_works, _birth


 def print_results(results):
@@ -73,7 +78,7 @@ def extract_wikidata_endpoint(author_names, show_warnings=True):
         return endpoint
     except IndexError:
         if show_warnings:
-            warnings.warn('Entity has not a wikimdata endpoint ')
+            warnings.warn('Entity has not a wikidata endpoint ')
         return None
@@ -94,12 +99,13 @@ for auth in tqdm.tqdm(full_auth_list):
     wikidata_endp = extract_wikidata_endpoint(entity_q, show_warnings=False)
     dict_res[wikidata_endp] = None
     if wikidata_endp is not None:
-        _, names, works, other_works = testing_wikidata(wikidata_endp)
+        _, names, works, other_works, y_birth = testing_wikidata(wikidata_endp)
         dict_res[wikidata_endp] = {'aut_name': names,
                                    'aut_works': works,
-                                   'aut_present_work': other_works}
+                                   'aut_present_work': other_works,
+                                   'birth': y_birth}

-with open('knowledge_base/KB_wikimedia.json', 'w+') as f:
+with open('knowledge_base/KB_wikimedia_with_dates.json', 'w+') as f:
     json.dump(dict_res, f)

 print(f'# Process finished in: {round((time.time()-stime), 5)}')
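The loop above serialises one record per resolved Wikidata endpoint, now including the birth value. As a quick orientation, a minimal sketch of reading the new file back; the field names come from the dict built above, while the Italian-label lookup and the printout are only illustrative:

    import json

    with open('knowledge_base/KB_wikimedia_with_dates.json') as f:
        kb = json.load(f)

    for endpoint, record in kb.items():
        if record is None:
            continue                                # endpoints that could not be resolved stay None
        names = record['aut_name']                  # mapping of language code to label
        print(endpoint, names.get('it'), record['birth'])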
@@ -30,7 +30,7 @@ with open('./KB_abs_merged.pickle', 'wb') as infile:
 from pprint import pprint
 pprint(merged['Giles_of_Rome'])
 """
-with open('./KB_abs_merged.pickle', 'rb') as infile:
+with open('knowledge_base/KB_abs_merged.pickle', 'rb') as infile:
     kb = pickle.load(infile)

 reversed_dict = {}
@@ -45,5 +45,5 @@ for key in kb.keys():

 print(len(reversed_dict))

-with open('./KB_abs_reversed.pickle', 'wb') as outfile:
+with open('knowledge_base/KB_abs_reversed.pickle', 'wb') as outfile:
     pickle.dump(reversed_dict, outfile)

Binary file not shown.
Binary file not shown.
@@ -1,82 +1,62 @@
-"""
-Should also evaluate IF and HOW actual spaCy KB could be deoloyed in this scenario
-
-https://github.com/seatgeek/fuzzywuzzy?source=post_page---------------------------
-"""
 # TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz

 from difflib import SequenceMatcher
-from pprint import pprint
-import pickle
+import json


-class Knowledge_base:
+class KnowledgeBase:

     def __init__(self, kb_path):
         with open(kb_path, 'rb') as infile:
-            data = pickle.load(infile)
+            data = json.load(infile)

-        self.kb = data
-        #self.utt2ent = self._generate_utter_2_ent()
+        self.id2aut = data
+        self.aut2id = {}
+        self._popolate_aut2id()

-    def link_entities(self, preds):
-        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER']
+    def link_entities(self, preds, deepfuzz=False):
+        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
         WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
+        print('-'*50)
         print(f'Candidate authors (i.e., entitites matched): {PER_preds}')
         # print(f'Candidates work:\n{WORK_preds}')

         COMMEDIA_DATE = 1321
+        print('-'*50 + '\nChecking in KB...')

-        """
-        for target in set(PER_preds):
-            if target in self.utt2ent.keys():
-                print(target, self.utt2ent[target])
-        """
-        print('#'*50 + '\nChecking in KB...')
-
-        # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'aquino)
-
+        # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'Aquino)
         for target in set(PER_preds):
             scores = []
-            for auth in self.kb.keys():
+            deepscore = []
+            for auth in self.aut2id.keys():
                 sim = self._similar(target, auth)
                 scores.append((auth, sim))

             scores.sort(key=lambda tup: tup[1], reverse=True)
+            success = False
             for i in range(3):
                 if scores[i][1] > .8:
-                    print(f'Prediction: {target} - {scores[i]} - born in {self.kb[scores[i][0]]["birth"]}')
-                    break
-            #elif scores[0][1] == 0:
-            #    print(f'Author {target} not in KB ')
+                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"], scores[i][1]} - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
+                    success = True
+                    break
+            if deepfuzz and not success:
+                for aut in self.aut2id.keys():
+                    _splitname = aut.split(' ')
+                    sim = 0
+                    for split in _splitname:
+                        _sim = self._similar(target, split)
+                        if _sim > sim:
+                            sim = _sim
+                    deepscore.append((aut, sim))
+                deepscore.sort(key=lambda tup: tup[1], reverse=True)
+                for j in range(3):
+                    if deepscore[j][1] > .8:
+                        print(
+                            f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"], deepscore[j][1]} - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
+                        break

         return 0

-        """
-        for target in set(PER_preds):
-            #print(f'TARGET: {target}')
-
-            scores = []
-            for auth in self.kb.keys():
-                sim = self._similar(target, auth)
-                scores.append((auth, sim))
-
-            scores.sort(key=lambda tup: tup[1], reverse=True)
-            # pprint(scores[:3])
-
-            all_lang_scores = self._check_other_lang(scores[0], target)
-
-            if all_lang_scores[0][1] >= 0.8: # with this threshold 'Tommaso' is not linked to 'Tommaso d'aquino' ...
-                print(f'TARGET: {target}')
-                print(f'{all_lang_scores[0][0]} was born in year: {self.kb[scores[0][0]]["birth"]}')
-                #print(all_lang_scores)
-            else:
-                continue
-                #print('Author not in KB')
-            print('-'*15)
-
-        """

     def _generate_utter_2_ent(self):
         utt_2_ent = {}
         for ent_en in self.kb.keys():
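The deepfuzz branch added above falls back from whole-name similarity to per-token similarity, so that a bare first name such as 'Tommaso' can still reach "Tommaso d'Aquino". A self-contained sketch of the same two-pass idea with difflib; the helper name and the sample names are illustrative only:

    from difflib import SequenceMatcher

    def best_match(target, names, threshold=0.8):
        # First pass: similarity against each full name.
        scores = sorted(((n, SequenceMatcher(None, target, n).ratio()) for n in names),
                        key=lambda t: t[1], reverse=True)
        if scores and scores[0][1] > threshold:
            return scores[0]
        # Second pass ("deepfuzz"): best similarity against any single token of the name.
        deep = sorted(((n, max(SequenceMatcher(None, target, tok).ratio() for tok in n.split(' ')))
                       for n in names), key=lambda t: t[1], reverse=True)
        return deep[0] if deep and deep[0][1] > threshold else None

    print(best_match('Tommaso', ["Tommaso d'Aquino", 'Alberto Magno']))  # ("Tommaso d'Aquino", 1.0)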
@@ -84,7 +64,6 @@ class Knowledge_base:
             utt_2_ent[utt] = ent_en
         return utt_2_ent

-
     def _check_other_lang(self, target, original_name):
         other_names = self.kb[target[0]]['names']

@@ -97,3 +76,12 @@ class Knowledge_base:

     def _similar(self,a, b):
         return SequenceMatcher(None, a, b).ratio()
+
+    def _popolate_aut2id(self):
+        for qid, values in self.id2aut.items():
+            if values is not None:
+                l_names = set(values['aut_name'].values())
+                for name in l_names:
+                    self.aut2id[name] = qid
+
+        return self
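The new _popolate_aut2id builds the reverse lookup from every language variant of an author's label to its identifier, which is what link_entities now iterates over. A small illustration of the intended mapping, with made-up identifiers, labels, and birth value:

    # Made-up identifiers and labels, shaped like the JSON knowledge base.
    id2aut = {
        'Q001': {'aut_name': {'it': 'Boezio', 'en': 'Boethius', 'la': 'Boetius'}, 'birth': 477},
        'Q002': None,   # unresolved endpoint
    }

    aut2id = {}
    for qid, values in id2aut.items():
        if values is not None:
            for name in set(values['aut_name'].values()):
                aut2id[name] = qid

    print(aut2id)   # e.g. {'Boezio': 'Q001', 'Boethius': 'Q001', 'Boetius': 'Q001'}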
@@ -6,12 +6,13 @@ import numpy as np
 from tqdm import tqdm

 #with open('./KB_abs_reversed.pickle', 'rb') as infile:
-with open('./KB_abs_merged.pickle', 'rb') as infile:
+with open('knowledge_base/KB_abs_merged.pickle', 'rb') as infile:
     entities_dict = pickle.load(infile)

 print(f'Number of entities in original knowledge Base: {len(entities_dict)}')
 #print(entities_dict.keys())


 def load_word_vectors(model, path_to_vec, max_vec=100000):
     with open(path_to_vec, 'r') as infile:
         header = infile.readline()
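Only the first lines of load_word_vectors appear in this hunk. For orientation, a minimal sketch of what such a loader typically does with a fastText .vec file, assuming the standard text format (a header line, then one token and its floats per line) and spaCy's Vocab.set_vector API; the function name here is illustrative, not the repository's implementation:

    import numpy as np

    def load_fasttext_vectors(model, path_to_vec, max_vec=100000):
        with open(path_to_vec, 'r') as infile:
            infile.readline()                     # skip the fastText header: "<num_vectors> <dim>"
            for i, line in enumerate(infile):
                if i >= max_vec:
                    break
                parts = line.rstrip().split(' ')
                model.vocab.set_vector(parts[0], np.asarray(parts[1:], dtype='float32'))
        return model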
@@ -31,6 +32,7 @@ def load_word_vectors(model, path_to_vec, max_vec=100000):

     return model

+
 def generate_IDs(entities_dict_keys):
     """
     Entities dictionary keys are english spelled names (if such an entities is
main.py
|
|
@ -4,7 +4,7 @@ import numpy as np
|
||||||
from spacy.util import minibatch, compounding
|
from spacy.util import minibatch, compounding
|
||||||
import warnings
|
import warnings
|
||||||
from preprocessing.ner_dataset_builder import DataSetBuilder
|
from preprocessing.ner_dataset_builder import DataSetBuilder
|
||||||
from entity_linker.knowledge_base import Knowledge_base
|
from entity_linker.knowledge_base import KnowledgeBase
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import pickle
|
import pickle
|
||||||
|
|
@@ -18,11 +18,10 @@ df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
 df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')


-def train_model(model, TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
+def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):

     model = spacy.load(SPACY_MODEL_STD)
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
-    TRAIN_DATA = TRAIN_DATA

     ner = model.get_pipe('ner')
|
||||||
i += 100
|
i += 100
|
||||||
print(comment[i:len(comment)])
|
print(comment[i:len(comment)])
|
||||||
|
|
||||||
|
|
||||||
disabled.restore()
|
disabled.restore()
|
||||||
|
|
||||||
print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
|
print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
|
||||||
|
|
@@ -100,7 +98,7 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
             print(gold)

     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
-    save_model(model, 'it_dante', output_dir)
+    save_model(model, 'it_dante_new', output_dir)

     return model
@@ -130,11 +128,11 @@ def predict_candidates(model, comment, labels=None):
     if labels is not None:
         query = comment
         gold = labels[labels['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
-        print(f'{len(gold)} GOLD TARGETS ' + '#'*50)
+        print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
         for i in range(len(gold)):
             elem = gold.iloc[i]
             print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}')
-        print('\n')
+        # print('\n')

     return candidates, gold
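The change above only affects how gold labels are printed; the candidate extraction itself sits outside the hunk. Assuming candidates are the (text, label) pairs of the spaCy entities, which is the shape KnowledgeBase.link_entities consumes, a minimal sketch (the helper name extract_candidates is hypothetical):

    def extract_candidates(model, comment):
        # Run the NER pipeline and keep (surface form, entity label) pairs,
        # e.g. ('Aristotele', 'PER'), as expected by link_entities.
        doc = model(comment)
        return [(ent.text, ent.label_) for ent in doc.ents]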
@@ -173,28 +171,28 @@ def load_word_vectors(model, path_to_vec, max_vec=100000):
 def main():
     df_TRAIN = df_monarchia
     df_eval = df_convivio
-    # df_eval = df_monarchia
     dataset = DataSetBuilder(df_TRAIN, df_eval, df_ner_unique)
     TRAIN_DATA = dataset.import_dataset_doccano('./commentaries/data_parsed/doccano_data/from_doccano_hdn1.json')
     commentaries_convivio_eva = dataset.clean_commentaries_eva
     commentaries_monarchia = dataset.clean_commentaries
     raw_commentaries_convivio = dataset.commentaries_eva
-    #nlp = spacy.load('it_core_news_sm')
-    #nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
-    #nlp = train_model(nlp, TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/')
-    #dataset_convivio = DataSetBuilder(df_eval, df_eval)
-    #dataset_convivio.export_dataset_doccano('std_convivio')

-    nlp = spacy.load('./model_fastText/')
-    #nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
+    # train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest')
+    # nlp = spacy.load('it_core_news_sm')
+    # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
+    # dataset_convivio = DataSetBuilder(df_eval, df_eval)
+    # dataset_convivio.export_dataset_doccano('std_convivio')

-    #print(len(list(nlp.vocab.strings))) # get whole model vocabulary
+    nlp = spacy.load('./model_fastText/model_spacy_latest')
+    # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
+    # print(len(list(nlp.vocab.strings))) # get whole model vocabulary

     seed = random.randint(1, len(commentaries_convivio_eva))
     preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)

-    kb = Knowledge_base('./entity_linker/KB_abs_reversed.pickle')
-    kb.link_entities(preds)
+    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json')
+    kb.link_entities(preds, deepfuzz=True)
+    print(f'\nComment Numbert: {seed}')

     exit()
@@ -14,6 +14,7 @@ COMMENTARIES_PATH = './commentaries/'
 DF_COMMENTARIES_PATH = './commentaries/data_parsed/'
 df_commentary_monarchia = pd.read_csv(DF_COMMENTARIES_PATH + 'monarchia_DF.csv')
 df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')

 """
 df_ner_unique ATM contains <i>terms</i> found in "De Monarchia". The .csv file should
 contain all the occurrences of tagged terms across all of the (tagged) documents!