function to extend KB with data from annotated commentaries

andrea 2020-10-23 16:59:01 +02:00
parent fb84b36b90
commit 2324ddff9f
5 changed files with 117 additions and 103 deletions

entity_linker/knowledge_base.py

@@ -1,30 +1,33 @@
 # TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
 from difflib import SequenceMatcher
 import json
+import numpy as np
 
 
 class KnowledgeBase:
-    def __init__(self, kb_path):
+    def __init__(self, kb_path, extension=None):
         with open(kb_path, 'rb') as infile:
             data = json.load(infile)
         self.id2aut = data
         self.aut2id = {}
+        self.works2aut = {}
         self._popolate_aut2id()
+        if extension is not None:
+            self._extend_kb(extension)
+            self._popolate_aut2id()
 
     def link_entities(self, preds, deepfuzz=False):
         PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
         WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
-        print('-'*50)
-        print(f'Candidate authors (i.e., entitites matched): {PER_preds}')
-        # print(f'Candidates work:\n{WORK_preds}')
+        # print('-'*50)
+        # print(f'Candidate authors (i.e., entities matched): {PER_preds}')
+        # print(f'Candidates work :{WORK_preds}')
         COMMEDIA_DATE = 1321
-        print('-'*50 + '\nChecking in KB...')
+        print('-'*50 + '\n\nOUTPUT:\n### Author matches:')
+        # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'Aquino)
+        aut_res = []
         for target in set(PER_preds):
             scores = []
             deepscore = []
@@ -36,8 +39,10 @@ class KnowledgeBase:
             success = False
             for i in range(3):
                 if scores[i][1] > .8:
-                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"], scores[i][1]} - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
+                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}')
+                    #, scores[i][1]}') # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
                     success = True
+                    aut_res.append(target)
                     break
             if deepfuzz and not success:
                 for aut in self.aut2id.keys():
@@ -50,38 +55,74 @@ class KnowledgeBase:
                     deepscore.append((aut, sim))
                 deepscore.sort(key=lambda tup: tup[1], reverse=True)
                 for j in range(3):
-                    if deepscore[j][1] > .8:
+                    if deepscore[j][1] > .9:
                         print(
-                            f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"], deepscore[j][1]} - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
+                            f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}')
+                        #, deepscore[j][1]}') # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
+                        aut_res.append(target)
                         break
-        return 0
 
-    def _generate_utter_2_ent(self):
-        utt_2_ent = {}
-        for ent_en in self.kb.keys():
-            for utt in self.kb[ent_en]['names']:
-                utt_2_ent[utt] = ent_en
-        return utt_2_ent
+        work_res = {}
+        if len(WORK_preds) != 0:
+            print('-' * 50 + '\n### Works matches:')
+            for target in set(WORK_preds):
+                scores_work = []
+                for work in self.works2aut.keys():
+                    sim = self._similar(target, work)
+                    scores_work.append((work, sim))
+                scores_work.sort(key=lambda tup: tup[1], reverse=True)
+                for i in range(3):
+                    if scores_work[i][1] > .75:
+                        print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}')
+                        work_res[target] = self.works2aut[scores_work[i][0]]
+                        break
+        return aut_res, work_res
 
-    def _check_other_lang(self, target, original_name):
-        other_names = self.kb[target[0]]['names']
-        scores = []
-        for name in other_names:
-            sim = self._similar(original_name, name)
-            scores.append((name, sim))
-        scores.sort(key=lambda tup: tup[1], reverse=True)
-        return scores
-
     def _similar(self,a, b):
         return SequenceMatcher(None, a, b).ratio()
 
     def _popolate_aut2id(self):
         for qid, values in self.id2aut.items():
+            if qid == 'null':
+                continue
             if values is not None:
                 l_names = set(values['aut_name'].values())
                 for name in l_names:
                     self.aut2id[name] = qid
+                works = values['aut_works']
+                if len(works) != 0:
+                    for wid, wvalues in works.items():
+                        try:
+                            self.works2aut[wvalues['it']] = self.id2aut[qid]['aut_name']['it']
+                        except:
+                            continue
         return self
+
+    def _extend_kb(self, df):
+        _qid = 0
+        prev_work = ''
+        for i in range(len(df)):
+            row = df.iloc[i]
+            auth = row.quot_author
+            work = row.quot_title
+            if auth is not np.nan and work is not np.nan:
+                if work != prev_work:
+                    try:
+                        qid = self.aut2id[auth]
+                        new_wid = f'W{_qid}'
+                        _qid += 1
+                        self.id2aut[qid]['aut_works'][new_wid] = {'it': work}
+                        prev_work = work
+                    except:
+                        new_qid = f'Q{str(_qid)}'
+                        new_wid = f'W{str(_qid)}'
                        _qid += 1
+                        self.id2aut[new_qid] = {'aut_name': {'it': auth},
+                                                'aut_works': {new_wid: {'it': work}},
+                                                'aut_present_work': {},
+                                                'birth': 0}
+                        prev_work = work
+            else:
+                continue
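
For orientation, this is roughly how the extended class is driven from main.py below. A minimal sketch, assuming the KB JSON exists at the path used in the repo; the sample DataFrame rows and predictions are invented, and only the quot_author/quot_title columns and the constructor/return signatures come from the diff.

import pandas as pd
from entity_linker.knowledge_base import KnowledgeBase

# Hypothetical annotated-commentary table; _extend_kb() only reads
# the quot_author and quot_title columns.
df_eval = pd.DataFrame({
    'quot_author': ['Aristotele', 'Virgilio'],
    'quot_title': ['Etica Nicomachea', 'Eneide'],
})

# extension=df_eval adds unseen author/work pairs to the KB and then
# rebuilds the lookup tables (aut2id, works2aut).
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json',
                   extension=df_eval)

# link_entities() now returns the linked authors and a work -> author map.
preds = [('Aristotile', 'PER'), ('Eneida', 'WORK_OF_ART')]
aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
print(aut_res, work_res)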

main.py

@@ -5,9 +5,8 @@ from spacy.util import minibatch, compounding
 import warnings
 from preprocessing.ner_dataset_builder import DataSetBuilder
 from entity_linker.knowledge_base import KnowledgeBase
-from tqdm import tqdm
+from tqdm import tqdm, trange
 from pathlib import Path
-import pickle
 import random
@@ -18,7 +17,16 @@ df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
 df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')
 
 
-def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
+def pprint_com(comment, l=100):
+    i = 0
+    while len(comment) > i + 100:
+        j = i + l
+        print(comment[i:j])
+        i += 100
+    print(comment[i:len(comment)])
+
+
+def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, nepochs=50, SPACY_MODEL_STD='it_core_news_sm'):
     model = spacy.load(SPACY_MODEL_STD)
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@@ -36,35 +44,22 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
     with model.disable_pipes(*other_pipes) and warnings.catch_warnings():
         print(f'Enabled pipes at training: {[pipe for pipe in model.pipe_names]}')
-        # warnings.filterwarnings("once", category=UserWarning, module='spacy')
         optimizer = model.resume_training()
-        n_epochs = 7
-        #batch_size = 32
+        n_epochs = nepochs
         print(f'\n## Begin Training')
-        for i in tqdm(range(n_epochs), desc='Iter'):
-            #print(f'Iteration {i+1}')
+        t = trange(n_epochs, desc='Iter')
+        for i in t:
             losses = {}
             random.shuffle(TRAIN_DATA)
             batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
             for batch in batches:
                 docs, golds = zip(*batch)
                 model.update(docs, golds, sgd=optimizer, losses=losses)
+            t.set_description(f'NER loss: {round(losses["ner"], 5)}')
         print(f'Final loss: {losses}')
 
     seed = random.randint(1, len(clean_commentaries))
-
-    def pprint_com(comment, l=100):
-        i = 0
-        while len(comment) > i+100:
-            j = i+l
-            print(comment[i:j])
-            i += 100
-        print(comment[i:len(comment)])
-
     disabled.restore()
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
     eg_eval = df_eval.iloc[seed]
@@ -80,21 +75,11 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
     for ent in doc.ents:
         print(ent.text, ent.label_)
-    """
-    print('\n')
-    print('-'*50)
-    print('STANDARD NER MODEL PREDICTIONS:')
-    nlp_reloaded = spacy.load('it_core_news_sm')
-    doc_STD = nlp_reloaded(clean_comment)
-    for ent in doc_STD.ents:
-        print(ent.text, ent.label_)
-    """
     print('\n')
     print('-'*50)
     print('GOLD:')
     query = eg_eval['comment']
-    gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
+    gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author']]
     print(gold)
 
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@@ -104,15 +89,6 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
 def predict_candidates(model, comment, labels=None):
-    def pprint_com(comment, l=100):
-        i = 0
-        while len(comment) > i+100:
-            j = i+l
-            print(comment[i:j])
-            i += 100
-        print(comment[i:len(comment)])
-
     clean_comment = comment.replace('<i>', '')
     clean_comment = clean_comment.replace('</i>', '')
     clean_comment = clean_comment.replace('\\', '')
@@ -123,7 +99,6 @@ def predict_candidates(model, comment, labels=None):
     print('\n')
 
     candidates = [(ent.text, ent.label_) for ent in doc.ents]
-    #print(candidates)
 
     if labels is not None:
         query = comment
@@ -131,7 +106,7 @@ def predict_candidates(model, comment, labels=None):
         print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
         for i in range(len(gold)):
             elem = gold.iloc[i]
-            print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}')
+            print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')
             # print('\n')
     return candidates, gold
@@ -168,6 +143,13 @@ def load_word_vectors(model, path_to_vec, max_vec=100000):
     return model
 
 
+def connect_aut_work(list_aut, list_work, kb):
+    print('\n\nTODO')
+    # qid_list = [kb.aut2id[author] for author in list_aut]
+    # wid_list = [kb.works2aut[work] for work in list_work]
+    # print('lel')
+
+
 def main():
     df_TRAIN = df_monarchia
     df_eval = df_convivio
@@ -178,21 +160,21 @@ def main():
     raw_commentaries_convivio = dataset.commentaries_eva
 
     # train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest')
-    # nlp = spacy.load('it_core_news_sm')
-    # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
-    # dataset_convivio = DataSetBuilder(df_eval, df_eval)
-    # dataset_convivio.export_dataset_doccano('std_convivio')
 
     nlp = spacy.load('./model_fastText/model_spacy_latest')
+    # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
+    # print(len(list(nlp.vocab.strings))) # get whole model vocabulary
 
     seed = random.randint(1, len(commentaries_convivio_eva))
     preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
-    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json')
-    kb.link_entities(preds, deepfuzz=True)
-    print(f'\nComment Numbert: {seed}')
+    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval)
+    aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
+    # Testing ------------------------
+    # connect_aut_work(aut_res, work_res, kb)
+    # --------------------------------
+    print(f'\nComment Number: {seed}')
+    # TODO: add a matcher that returns s_char and end_char of the matched entities!
     exit()
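
The switch from tqdm(range(...)) to trange in train_model is what lets the progress bar carry the running NER loss via set_description. A standalone sketch of that pattern with dummy loss values, not the project's actual training loop:

from tqdm import trange
import random

t = trange(5, desc='Iter')
for i in t:
    losses = {'ner': random.random() * 10}   # stand-in for spaCy's losses dict
    t.set_description(f'NER loss: {round(losses["ner"], 5)}')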

preprocessing/ner_dataset_builder.py

@@ -92,7 +92,6 @@ class DataSetBuilder:
         self.ner_clean_lookup = ner_clean_lookup
         return ner_lookup, ner_clean_lookup
 
-
     def _annotate_commentaries(self):
         """
@@ -124,7 +123,6 @@ class DataSetBuilder:
             matches_in_clean_commentaries.append(res)
         return matches_in_commentaries, matches_in_clean_commentaries
 
-
     def build_train_data(self):
         from collections import OrderedDict
@@ -142,7 +140,6 @@ class DataSetBuilder:
         self.TRAIN_DATA = TRAIN_DATA
         return TRAIN_DATA
 
-
     def get_rehearsal_data(self):
         revision_data = []
         print('# NB: TAGGING WITH standard spacy model!')
@@ -155,13 +152,12 @@ class DataSetBuilder:
         self.revision_data = revision_data
         return revision_data
 
-
     def train_model(self):
         from toolz import partition_all # See Docs @ https://toolz.readthedocs.io/en/latest/
         import random
 
         nlp_std = spacy.load(self.SPACY_MODEL_STD)
         #revision_data = self.get_rehearsal_data()
         #REHEARSAL_DATA = self.TRAIN_DATA[:300] + revision_data[:150]
@@ -178,10 +174,10 @@ class DataSetBuilder:
         other_pipes = [pipe for pipe in nlp_std.pipe_names if pipe not in pipe_exceptions]
         optimizer = nlp_std.resume_training()
 
         with nlp_std.disable_pipes(*other_pipes) and warnings.catch_warnings():
             warnings.filterwarnings("once", category=UserWarning, module='spacy')
 
             n_epochs = 10
             #batch_size = 32
             print(f'\n## Begin Training')
@@ -191,9 +187,7 @@ class DataSetBuilder:
                 random.shuffle(REHEARSAL_DATA)
                 batches = minibatch(REHEARSAL_DATA, size=compounding(4.0, 32.0, 1.001))
                 for batch in batches:
-                    #for batch in partition_all(batch_size, REHEARSAL_DATA):
                     docs, golds = zip(*batch)
-                    #texts, annotations = zip(*batch)
                     nlp_std.update(docs, golds, sgd=optimizer, losses=losses)
                 print(f'loss: {losses}')
@@ -221,7 +215,6 @@ class DataSetBuilder:
         for ent in doc_STD.ents:
             print(ent.text, ent.label_)
 
-
     def export_dataset_doccano(self, outputfile_name):
         """
         Doccano JSONL data format:
@@ -239,7 +232,6 @@ class DataSetBuilder:
         with jsonlines.open(f'./commentaries/data_parsed/doccano_data/{outputfile_name}.jsonl', mode='w') as writer:
             writer.write_all(output)
 
-
     def merge_rehearsed(self, data, revision_data):
         res = []
         revision_data = revision_data
@@ -258,7 +250,6 @@ class DataSetBuilder:
         return res
 
-
     def import_dataset_doccano(self, path):
         data = []
         with open(path) as infile:
@@ -266,20 +257,20 @@ class DataSetBuilder:
             for line in content:
                 json_data = json.loads(line)
-                ent = {'entities':[]}
+                ent = {'entities': []}
                 ent['entities'] = json_data['labels']
                 data.append((json_data['text'], ent))
         self.TRAIN_DATA = data
         return data
 
 
-if __name__ == '__main__':
-    data = DataSetBuilder(df_commentary, df_ner_unique)
-
-    ner_lookup, ner_clean_lookup = data.get_NER_lookup()
-    data.get_commentaries()
-    #data.build_train_data()
-    #data.train_model()
-    #data.export_dataset_doccano()
-    #data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1')
-    #data.train_model()
+# if __name__ == '__main__':
+# data = DataSetBuilder(df_commentary, df_ner_unique)
+#
+# ner_lookup, ner_clean_lookup = data.get_NER_lookup()
+# data.get_commentaries()
+# data.build_train_data()
+# data.train_model()
+# data.export_dataset_doccano()
+# data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1')
+# data.train_model()
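
As a reminder of what import_dataset_doccano() consumes, here is a sketch of one Doccano-style JSONL record and how it maps onto the (text, {'entities': [...]}) tuples stored in TRAIN_DATA; the text and span values are invented for illustration.

import json

# One JSONL line per example: the raw text plus [start, end, label] spans.
line = '{"text": "Come dice Aristotele ne la sua Etica", "labels": [[10, 20, "PER"]]}'

json_data = json.loads(line)
ent = {'entities': json_data['labels']}
example = (json_data['text'], ent)   # same shape as the entries in TRAIN_DATA
print(example)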