Add a function to extend the KB with data from annotated commentaries

andrea 2020-10-23 16:59:01 +02:00
parent fb84b36b90
commit 2324ddff9f
5 changed files with 117 additions and 103 deletions
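In short: KnowledgeBase now takes an optional extension argument, a dataframe of annotated commentaries whose quot_author / quot_title columns are folded into the KB before linking, and link_entities now returns the matched authors and works. A minimal usage sketch based on the calls in main.py below; the dataframe rows and NER predictions are illustrative stand-ins, while the KB path and call signatures come from the diff:

import pandas as pd
from entity_linker.knowledge_base import KnowledgeBase

# Stand-ins for df_eval (read from the commentaries CSVs) and for the spaCy NER output.
df_eval = pd.DataFrame({'quot_author': ['Aristotele'], 'quot_title': ['Etica']})
preds = [('Aristotile', 'PER'), ('Etica', 'WORK_OF_ART')]

kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json',
                   extension=df_eval)
aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
# aut_res: list of author mentions matched in the KB; work_res: {work mention: author name}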

View File

@ -1,30 +1,33 @@
# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
from difflib import SequenceMatcher
import json
import numpy as np
class KnowledgeBase:
def __init__(self, kb_path):
def __init__(self, kb_path, extension=None):
with open(kb_path, 'rb') as infile:
data = json.load(infile)
self.id2aut = data
self.aut2id = {}
self.works2aut = {}
self._popolate_aut2id()
if extension is not None:
self._extend_kb(extension)
self._popolate_aut2id()
def link_entities(self, preds, deepfuzz=False):
PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
print('-'*50)
print(f'Candidate authors (i.e., entitites matched): {PER_preds}')
# print(f'Candidates work:\n{WORK_preds}')
# print('-'*50)
# print(f'Candidate authors (i.e., entities matched): {PER_preds}')
# print(f'Candidate works: {WORK_preds}')
COMMEDIA_DATE = 1321
print('-'*50 + '\nChecking in KB...')
# TODO: the author dict should also map bare given names to the full entry (e.g., Tommaso --> Tommaso d'Aquino)
print('-'*50 + '\n\nOUTPUT:\n### Author matches:')
aut_res = []
for target in set(PER_preds):
scores = []
deepscore = []
@ -36,8 +39,10 @@ class KnowledgeBase:
success = False
for i in range(3):
if scores[i][1] > .8:
print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"], scores[i][1]} - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}')
#, scores[i][1]}') # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
success = True
aut_res.append(target)
break
if deepfuzz and not success:
for aut in self.aut2id.keys():
@ -50,38 +55,74 @@ class KnowledgeBase:
deepscore.append((aut, sim))
deepscore.sort(key=lambda tup: tup[1], reverse=True)
for j in range(3):
if deepscore[j][1] > .8:
if deepscore[j][1] > .9:
print(
f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"], deepscore[j][1]} - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}')
#, deepscore[j][1]}') # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
aut_res.append(target)
break
return 0
def _generate_utter_2_ent(self):
utt_2_ent = {}
for ent_en in self.kb.keys():
for utt in self.kb[ent_en]['names']:
utt_2_ent[utt] = ent_en
return utt_2_ent
work_res = {}
if len(WORK_preds) != 0:
print('-' * 50 + '\n### Works matches:')
for target in set(WORK_preds):
scores_work = []
for work in self.works2aut.keys():
sim = self._similar(target, work)
scores_work.append((work, sim))
scores_work.sort(key=lambda tup: tup[1], reverse=True)
for i in range(3):
if scores_work[i][1] > .75:
print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}')
work_res[target] = self.works2aut[scores_work[i][0]]
break
def _check_other_lang(self, target, original_name):
other_names = self.kb[target[0]]['names']
scores = []
for name in other_names:
sim = self._similar(original_name, name)
scores.append((name, sim))
scores.sort(key=lambda tup: tup[1], reverse=True)
return scores
return aut_res, work_res
def _similar(self,a, b):
return SequenceMatcher(None, a, b).ratio()
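# Intuition for the fuzzy-match cutoffs used above: ratio() is 2*M/T, where M is the
# number of matching characters and T the combined length of the two strings; e.g. a
# pair such as 'Aristotile' / 'Aristotele' scores 2*9/20 = 0.9 (illustrative example).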
def _popolate_aut2id(self):
for qid, values in self.id2aut.items():
if qid == 'null':
continue
if values is not None:
l_names = set(values['aut_name'].values())
for name in l_names:
self.aut2id[name] = qid
works = values['aut_works']
if len(works) != 0:
for wid, wvalues in works.items():
try:
self.works2aut[wvalues['it']] = self.id2aut[qid]['aut_name']['it']
except:
continue
return self
    def _extend_kb(self, df):
        # Fold author/work pairs from the annotated-commentaries dataframe into the KB.
        _qid = 0
        prev_work = ''
        for i in range(len(df)):
            row = df.iloc[i]
            auth = row.quot_author
            work = row.quot_title
            if auth is not np.nan and work is not np.nan:
                if work != prev_work:
                    try:
                        # author already in aut2id: attach the work to its entry
                        qid = self.aut2id[auth]
                        new_wid = f'W{_qid}'
                        _qid += 1
                        self.id2aut[qid]['aut_works'][new_wid] = {'it': work}
                        prev_work = work
                    except KeyError:
                        # unknown author: create a provisional entry
                        # (NB: the provisional 'Q' ids could clash with real Wikidata QIDs already in the KB)
                        new_qid = f'Q{_qid}'
                        new_wid = f'W{_qid}'
                        _qid += 1
                        self.id2aut[new_qid] = {'aut_name': {'it': auth},
                                                'aut_works': {new_wid: {'it': work}},
                                                'aut_present_work': {},
                                                'birth': 0}
                        prev_work = work
            else:
                continue
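For reference, a provisional entry that _extend_kb creates for an author not yet present in the KB has the following shape; the id, author and work are illustrative, while entries taken from Wikidata carry real QIDs and birth years:

{'Q0': {'aut_name': {'it': 'Brunetto Latini'},
        'aut_works': {'W0': {'it': 'Tesoretto'}},
        'aut_present_work': {},
        'birth': 0}}  # 0 is the placeholder used when only the annotation is available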

main.py (86 lines changed)
View File

@ -5,9 +5,8 @@ from spacy.util import minibatch, compounding
import warnings
from preprocessing.ner_dataset_builder import DataSetBuilder
from entity_linker.knowledge_base import KnowledgeBase
from tqdm import tqdm
from tqdm import tqdm, trange
from pathlib import Path
import pickle
import random
@ -18,7 +17,16 @@ df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')
def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
def pprint_com(comment, l=100):
    # pretty-print a commentary in lines of at most l characters
    i = 0
    while len(comment) > i + l:
        print(comment[i:i + l])
        i += l
    print(comment[i:])
def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, nepochs=50, SPACY_MODEL_STD='it_core_news_sm'):
model = spacy.load(SPACY_MODEL_STD)
print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@ -36,35 +44,22 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
with model.disable_pipes(*other_pipes) and warnings.catch_warnings():
print(f'Enabled pipes at training: {[pipe for pipe in model.pipe_names]}')
# warnings.filterwarnings("once", category=UserWarning, module='spacy')
optimizer = model.resume_training()
n_epochs = 7
#batch_size = 32
n_epochs = nepochs
print(f'\n## Begin Training')
for i in tqdm(range(n_epochs), desc='Iter'):
#print(f'Iteration {i+1}')
t = trange(n_epochs, desc='Iter')
for i in t:
losses = {}
random.shuffle(TRAIN_DATA)
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
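# compounding(4.0, 32.0, 1.001) yields a batch size that starts at 4 and grows by a factor of 1.001 per batch, capped at 32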
for batch in batches:
docs, golds = zip(*batch)
model.update(docs, golds, sgd=optimizer, losses=losses)
t.set_description(f'NER loss: {round(losses["ner"], 5)}')
print(f'Final loss: {losses}')
seed = random.randint(1, len(clean_commentaries))
def pprint_com(comment, l=100):
i = 0
while len(comment) > i+100:
j = i+l
print(comment[i:j])
i += 100
print(comment[i:len(comment)])
disabled.restore()
print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
eg_eval = df_eval.iloc[seed]
@ -80,21 +75,11 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
for ent in doc.ents:
print(ent.text, ent.label_)
"""
print('\n')
print('-'*50)
print('STANDARD NER MODEL PREDICTIONS:')
nlp_reloaded = spacy.load('it_core_news_sm')
doc_STD = nlp_reloaded(clean_comment)
for ent in doc_STD.ents:
print(ent.text, ent.label_)
"""
print('\n')
print('-'*50)
print('GOLD:')
query = eg_eval['comment']
gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author']]
print(gold)
print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@ -104,15 +89,6 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
def predict_candidates(model, comment, labels=None):
def pprint_com(comment, l=100):
i = 0
while len(comment) > i+100:
j = i+l
print(comment[i:j])
i += 100
print(comment[i:len(comment)])
clean_comment = comment.replace('<i>', '')
clean_comment = clean_comment.replace('</i>', '')
clean_comment = clean_comment.replace('\\', '')
@ -123,7 +99,6 @@ def predict_candidates(model, comment, labels=None):
print('\n')
candidates = [(ent.text, ent.label_) for ent in doc.ents]
#print(candidates)
if labels is not None:
query = comment
@ -131,7 +106,7 @@ def predict_candidates(model, comment, labels=None):
print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
for i in range(len(gold)):
elem = gold.iloc[i]
print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}')
print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')
# print('\n')
return candidates, gold
@ -168,6 +143,13 @@ def load_word_vectors(model, path_to_vec, max_vec=100000):
return model
def connect_aut_work(list_aut, list_work, kb):
print('\n\nTODO')
# qid_list = [kb.aut2id[author] for author in list_aut]
# wid_list = [kb.works2aut[work] for work in list_work]
# print('lel')
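connect_aut_work is still a stub; a minimal sketch of one possible completion, assuming the intent hinted at by the commented lines above is simply to pair each matched work in work_res with the KB id of its author (this is not the author's final implementation):

def connect_aut_work(list_aut, list_work, kb):
    # list_work is work_res: {work mention: canonical author name}
    # list_aut is aut_res: author mentions that matched the KB (unused here,
    # but could be used to flag authors with no attached work)
    links = {}
    for work, author in list_work.items():
        links[work] = (author, kb.aut2id.get(author))  # (author name, KB id or None)
    return links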
def main():
df_TRAIN = df_monarchia
df_eval = df_convivio
@ -178,21 +160,21 @@ def main():
raw_commentaries_convivio = dataset.commentaries_eva
# train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest')
# nlp = spacy.load('it_core_news_sm')
# nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
# dataset_convivio = DataSetBuilder(df_eval, df_eval)
# dataset_convivio.export_dataset_doccano('std_convivio')
nlp = spacy.load('./model_fastText/model_spacy_latest')
# nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
# print(len(list(nlp.vocab.strings))) # get whole model vocabulary
seed = random.randint(1, len(commentaries_convivio_eva))
preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json')
kb.link_entities(preds, deepfuzz=True)
print(f'\nComment Numbert: {seed}')
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval)
aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
# Testing ------------------------
# connect_aut_work(aut_res, work_res, kb)
# --------------------------------
print(f'\nComment Number: {seed}')
# TODO: add a matcher that returns s_char and end_char of the matched entities!
exit()

View File

@ -92,7 +92,6 @@ class DataSetBuilder:
self.ner_clean_lookup = ner_clean_lookup
return ner_lookup, ner_clean_lookup
def _annotate_commentaries(self):
"""
@ -124,7 +123,6 @@ class DataSetBuilder:
matches_in_clean_commentaries.append(res)
return matches_in_commentaries, matches_in_clean_commentaries
def build_train_data(self):
from collections import OrderedDict
@ -142,7 +140,6 @@ class DataSetBuilder:
self.TRAIN_DATA = TRAIN_DATA
return TRAIN_DATA
def get_rehearsal_data(self):
revision_data = []
print('# NB: TAGGING WITH standard spacy model!')
@ -155,13 +152,12 @@ class DataSetBuilder:
self.revision_data = revision_data
return revision_data
def train_model(self):
from toolz import partition_all # See Docs @ https://toolz.readthedocs.io/en/latest/
import random
nlp_std = spacy.load(self.SPACY_MODEL_STD)
#revision_data = self.get_rehearsal_data()
#REHEARSAL_DATA = self.TRAIN_DATA[:300] + revision_data[:150]
@ -178,10 +174,10 @@ class DataSetBuilder:
other_pipes = [pipe for pipe in nlp_std.pipe_names if pipe not in pipe_exceptions]
optimizer = nlp_std.resume_training()
with nlp_std.disable_pipes(*other_pipes) and warnings.catch_warnings():
warnings.filterwarnings("once", category=UserWarning, module='spacy')
n_epochs = 10
#batch_size = 32
print(f'\n## Begin Training')
@ -191,9 +187,7 @@ class DataSetBuilder:
random.shuffle(REHEARSAL_DATA)
batches = minibatch(REHEARSAL_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
#for batch in partition_all(batch_size, REHEARSAL_DATA):
docs, golds = zip(*batch)
#texts, annotations = zip(*batch)
nlp_std.update(docs, golds, sgd=optimizer, losses=losses)
print(f'loss: {losses}')
@ -221,7 +215,6 @@ class DataSetBuilder:
for ent in doc_STD.ents:
print(ent.text, ent.label_)
def export_dataset_doccano(self, outputfile_name):
"""
Doccano JSONL data format:
@ -239,7 +232,6 @@ class DataSetBuilder:
with jsonlines.open(f'./commentaries/data_parsed/doccano_data/{outputfile_name}.jsonl', mode='w') as writer:
writer.write_all(output)
def merge_rehearsed(self, data, revision_data):
res = []
revision_data = revision_data
@ -258,7 +250,6 @@ class DataSetBuilder:
return res
def import_dataset_doccano(self, path):
data = []
with open(path) as infile:
@ -266,20 +257,20 @@ class DataSetBuilder:
for line in content:
json_data = json.loads(line)
ent = {'entities':[]}
ent = {'entities': []}
ent['entities'] = json_data['labels']
data.append((json_data['text'], ent))
self.TRAIN_DATA = data
return data
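For reference, import_dataset_doccano expects one JSON object per line with a 'text' field and character-offset 'labels'; an input line and the spaCy-style tuple built from it would look roughly as follows (the sentence is invented, the offsets refer to it):

# Doccano JSONL line:
#   {"text": "come dice Aristotile nel primo de l'Etica", "labels": [[10, 20, "PER"], [36, 41, "WORK_OF_ART"]]}
# resulting TRAIN_DATA item:
#   ("come dice Aristotile nel primo de l'Etica", {'entities': [[10, 20, "PER"], [36, 41, "WORK_OF_ART"]]})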
if __name__ == '__main__':
data = DataSetBuilder(df_commentary, df_ner_unique)
ner_lookup, ner_clean_lookup = data.get_NER_lookup()
data.get_commentaries()
#data.build_train_data()
#data.train_model()
#data.export_dataset_doccano()
#data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1')
#data.train_model()
# if __name__ == '__main__':
# data = DataSetBuilder(df_commentary, df_ner_unique)
#
# ner_lookup, ner_clean_lookup = data.get_NER_lookup()
# data.get_commentaries()
# data.build_train_data()
# data.train_model()
# data.export_dataset_doccano()
# data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1')
# data.train_model()