Add function to extend the KB with data from annotated commentaries
This commit is contained in:
parent fb84b36b90
commit 2324ddff9f
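The change threads an optional pandas DataFrame through KnowledgeBase.__init__ into the new _extend_kb() helper, so authors and works found in the annotated commentaries are merged into the Wikimedia-derived KB before entity linking. A minimal usage sketch, assuming a DataFrame with the quot_author / quot_title columns used by _extend_kb; the CSV path and its contents are hypothetical, while the KB path and import come from this diff:

import pandas as pd
from entity_linker.knowledge_base import KnowledgeBase

# hypothetical annotation export with quot_author / quot_title columns
df_eval = pd.read_csv('commentaries_with_annotations.csv')
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval)
aut_res, work_res = kb.link_entities([('Aristotile', 'PER')], deepfuzz=True)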
@@ -1,30 +1,33 @@
# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
from difflib import SequenceMatcher
import json
import numpy as np


class KnowledgeBase:

    def __init__(self, kb_path):
    def __init__(self, kb_path, extension=None):
        with open(kb_path, 'rb') as infile:
            data = json.load(infile)

        self.id2aut = data
        self.aut2id = {}
        self.works2aut = {}
        self._popolate_aut2id()
        if extension is not None:
            self._extend_kb(extension)
            self._popolate_aut2id()

    def link_entities(self, preds, deepfuzz=False):
        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
        WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
        print('-'*50)
        print(f'Candidate authors (i.e., entitites matched): {PER_preds}')
        # print(f'Candidates work:\n{WORK_preds}')
        # print('-'*50)
        # print(f'Candidate authors (i.e., entities matched): {PER_preds}')
        # print(f'Candidates work :{WORK_preds}')

        COMMEDIA_DATE = 1321
        print('-'*50 + '\nChecking in KB...')

        # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'Aquino)
        print('-'*50 + '\n\nOUTPUT:\n### Author matches:')

        aut_res = []
        for target in set(PER_preds):
            scores = []
            deepscore = []
@@ -36,8 +39,10 @@ class KnowledgeBase:
            success = False
            for i in range(3):
                if scores[i][1] > .8:
                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"], scores[i][1]} - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}')
                    #, scores[i][1]}') # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
                    success = True
                    aut_res.append(target)
                    break
            if deepfuzz and not success:
                for aut in self.aut2id.keys():
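The 0.8 / 0.9 cut-offs above are applied to difflib SequenceMatcher ratios, computed by the _similar() helper further down in this file. A small standalone illustration; the names are made-up examples, not KB entries:

from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

# Short forms score well below the 0.8 threshold (roughly 0.61 here), which is what the
# TODO about inserting single names (Tommaso --> Tommaso d'Aquino) is meant to address.
print(similar('Tommaso', "Tommaso d'Aquino"))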
@@ -50,38 +55,74 @@ class KnowledgeBase:
                    deepscore.append((aut, sim))
                deepscore.sort(key=lambda tup: tup[1], reverse=True)
                for j in range(3):
                    if deepscore[j][1] > .8:
                    if deepscore[j][1] > .9:
                        print(
                            f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"], deepscore[j][1]} - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
                            f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}')
                        #, deepscore[j][1]}') # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
                        aut_res.append(target)
                        break

        return 0

    def _generate_utter_2_ent(self):
        utt_2_ent = {}
        for ent_en in self.kb.keys():
            for utt in self.kb[ent_en]['names']:
                utt_2_ent[utt] = ent_en
        return utt_2_ent
        work_res = {}
        if len(WORK_preds) != 0:
            print('-' * 50 + '\n### Works matches:')
            for target in set(WORK_preds):
                scores_work = []
                for work in self.works2aut.keys():
                    sim = self._similar(target, work)
                    scores_work.append((work, sim))
                scores_work.sort(key=lambda tup: tup[1], reverse=True)
                for i in range(3):
                    if scores_work[i][1] > .75:
                        print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}')
                        work_res[target] = self.works2aut[scores_work[i][0]]
                        break

    def _check_other_lang(self, target, original_name):
        other_names = self.kb[target[0]]['names']

        scores = []
        for name in other_names:
            sim = self._similar(original_name, name)
            scores.append((name, sim))
        scores.sort(key=lambda tup: tup[1], reverse=True)
        return scores
        return aut_res, work_res

    def _similar(self,a, b):
        return SequenceMatcher(None, a, b).ratio()

    def _popolate_aut2id(self):
        for qid, values in self.id2aut.items():
            if qid == 'null':
                continue
            if values is not None:
                l_names = set(values['aut_name'].values())
                for name in l_names:
                    self.aut2id[name] = qid
                works = values['aut_works']
                if len(works) != 0:
                    for wid, wvalues in works.items():
                        try:
                            self.works2aut[wvalues['it']] = self.id2aut[qid]['aut_name']['it']
                        except:
                            continue

        return self

    def _extend_kb(self, df):
        _qid = 0
        prev_work = ''
        for i in range(len(df)):
            row = df.iloc[i]
            auth = row.quot_author
            work = row.quot_title
            if auth is not np.nan and work is not np.nan:
                if work != prev_work:
                    try:
                        qid = self.aut2id[auth]
                        new_wid = f'W{_qid}'
                        _qid += 1
                        self.id2aut[qid]['aut_works'][new_wid] = {'it': work}
                        prev_work = work
                    except:
                        new_qid = f'Q{str(_qid)}'
                        new_wid = f'W{str(_qid)}'
                        _qid += 1
                        self.id2aut[new_qid] = {'aut_name': {'it': auth},
                                                'aut_works': {new_wid: {'it': work}},
                                                'aut_present_work': {},
                                                'birth': 0}
                        prev_work = work
            else:
                continue
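For a row whose author is not yet in aut2id, _extend_kb() falls into the except branch and mints fresh Q/W identifiers. A sketch of the resulting entry, with a hypothetical row and the field names taken from the code above:

import pandas as pd

extension = pd.DataFrame([{'quot_author': 'Boezio', 'quot_title': 'De consolatione philosophiae'}])
# Assuming 'Boezio' is missing from aut2id, the first row would produce:
# id2aut['Q0'] == {'aut_name': {'it': 'Boezio'},
#                  'aut_works': {'W0': {'it': 'De consolatione philosophiae'}},
#                  'aut_present_work': {},
#                  'birth': 0}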
main.py
@@ -5,9 +5,8 @@ from spacy.util import minibatch, compounding
import warnings
from preprocessing.ner_dataset_builder import DataSetBuilder
from entity_linker.knowledge_base import KnowledgeBase
from tqdm import tqdm
from tqdm import tqdm, trange
from pathlib import Path
import pickle
import random

@@ -18,7 +17,16 @@ df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')


def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
def pprint_com(comment, l=100):
    i = 0
    while len(comment) > i + 100:
        j = i + l
        print(comment[i:j])
        i += 100
    print(comment[i:len(comment)])


def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, nepochs=50, SPACY_MODEL_STD='it_core_news_sm'):

    model = spacy.load(SPACY_MODEL_STD)
    print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@@ -36,35 +44,22 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL

    with model.disable_pipes(*other_pipes) and warnings.catch_warnings():
        print(f'Enabled pipes at training: {[pipe for pipe in model.pipe_names]}')
        # warnings.filterwarnings("once", category=UserWarning, module='spacy')

        optimizer = model.resume_training()

        n_epochs = 7
        #batch_size = 32
        n_epochs = nepochs
        print(f'\n## Begin Training')
        for i in tqdm(range(n_epochs), desc='Iter'):
            #print(f'Iteration {i+1}')
        t = trange(n_epochs, desc='Iter')
        for i in t:
            losses = {}
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                docs, golds = zip(*batch)
                model.update(docs, golds, sgd=optimizer, losses=losses)
            t.set_description(f'NER loss: {round(losses["ner"], 5)}')
        print(f'Final loss: {losses}')

    seed = random.randint(1, len(clean_commentaries))

    def pprint_com(comment, l=100):
        i = 0
        while len(comment) > i+100:
            j = i+l
            print(comment[i:j])
            i += 100
        print(comment[i:len(comment)])

    disabled.restore()

    print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')

    eg_eval = df_eval.iloc[seed]
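Replacing tqdm(range(...)) with trange keeps a handle on the progress bar, so the loop can report the current NER loss in place via set_description. A minimal standalone sketch of that pattern, using dummy loss values:

from tqdm import trange

t = trange(5, desc='Iter')
for i in t:
    loss = 1.0 / (i + 1)  # stand-in for losses["ner"]
    t.set_description(f'NER loss: {round(loss, 5)}')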
@@ -80,21 +75,11 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
    for ent in doc.ents:
        print(ent.text, ent.label_)

    """
    print('\n')
    print('-'*50)
    print('STANDARD NER MODEL PREDICTIONS:')
    nlp_reloaded = spacy.load('it_core_news_sm')
    doc_STD = nlp_reloaded(clean_comment)
    for ent in doc_STD.ents:
        print(ent.text, ent.label_)
    """

    print('\n')
    print('-'*50)
    print('GOLD:')
    query = eg_eval['comment']
    gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
    gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author']]
    print(gold)

    print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@@ -104,15 +89,6 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL


def predict_candidates(model, comment, labels=None):

    def pprint_com(comment, l=100):
        i = 0
        while len(comment) > i+100:
            j = i+l
            print(comment[i:j])
            i += 100
        print(comment[i:len(comment)])

    clean_comment = comment.replace('<i>', '')
    clean_comment = clean_comment.replace('</i>', '')
    clean_comment = clean_comment.replace('\\', '')
@@ -123,7 +99,6 @@ def predict_candidates(model, comment, labels=None):

    print('\n')
    candidates = [(ent.text, ent.label_) for ent in doc.ents]
    #print(candidates)

    if labels is not None:
        query = comment
@@ -131,7 +106,7 @@ def predict_candidates(model, comment, labels=None):
        print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
        for i in range(len(gold)):
            elem = gold.iloc[i]
            print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}')
            print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')
            # print('\n')

    return candidates, gold
@@ -168,6 +143,13 @@ def load_word_vectors(model, path_to_vec, max_vec=100000):
    return model


def connect_aut_work(list_aut, list_work, kb):
    print('\n\nTODO')
    # qid_list = [kb.aut2id[author] for author in list_aut]
    # wid_list = [kb.works2aut[work] for work in list_work]
    # print('lel')


def main():
    df_TRAIN = df_monarchia
    df_eval = df_convivio
@@ -178,21 +160,21 @@ def main():
    raw_commentaries_convivio = dataset.commentaries_eva

    # train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest')
    # nlp = spacy.load('it_core_news_sm')
    # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
    # dataset_convivio = DataSetBuilder(df_eval, df_eval)
    # dataset_convivio.export_dataset_doccano('std_convivio')

    nlp = spacy.load('./model_fastText/model_spacy_latest')
    # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
    # print(len(list(nlp.vocab.strings))) # get whole model vocabulary

    seed = random.randint(1, len(commentaries_convivio_eva))
    preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)

    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json')
    kb.link_entities(preds, deepfuzz=True)
    print(f'\nComment Numbert: {seed}')
    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval)
    aut_res, work_res = kb.link_entities(preds, deepfuzz=True)

    # Testing ------------------------
    # connect_aut_work(aut_res, work_res, kb)
    # --------------------------------

    print(f'\nComment Number: {seed}')

    # TODO: add a matcher that returns s_char and end_char of the matched entities!

    exit()
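The TODO about start and end character offsets maps directly onto spaCy spans, which already expose start_char and end_char. A possible extension of predict_candidates, using an invented sentence:

import spacy

nlp = spacy.load('it_core_news_sm')
doc = nlp('Come dice Aristotile nel primo de la Fisica')
candidates = [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents]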
@@ -92,7 +92,6 @@ class DataSetBuilder:
        self.ner_clean_lookup = ner_clean_lookup

        return ner_lookup, ner_clean_lookup


    def _annotate_commentaries(self):
        """
@@ -124,7 +123,6 @@ class DataSetBuilder:
            matches_in_clean_commentaries.append(res)

        return matches_in_commentaries, matches_in_clean_commentaries


    def build_train_data(self):
        from collections import OrderedDict
@@ -142,7 +140,6 @@ class DataSetBuilder:
        self.TRAIN_DATA = TRAIN_DATA
        return TRAIN_DATA


    def get_rehearsal_data(self):
        revision_data = []
        print('# NB: TAGGING WITH standard spacy model!')
@@ -155,13 +152,12 @@ class DataSetBuilder:
        self.revision_data = revision_data
        return revision_data


    def train_model(self):
        from toolz import partition_all # See Docs @ https://toolz.readthedocs.io/en/latest/
        import random

        nlp_std = spacy.load(self.SPACY_MODEL_STD)


        #revision_data = self.get_rehearsal_data()

        #REHEARSAL_DATA = self.TRAIN_DATA[:300] + revision_data[:150]
@@ -178,10 +174,10 @@ class DataSetBuilder:
        other_pipes = [pipe for pipe in nlp_std.pipe_names if pipe not in pipe_exceptions]

        optimizer = nlp_std.resume_training()


        with nlp_std.disable_pipes(*other_pipes) and warnings.catch_warnings():
            warnings.filterwarnings("once", category=UserWarning, module='spacy')


            n_epochs = 10
            #batch_size = 32
            print(f'\n## Begin Training')
@@ -191,9 +187,7 @@ class DataSetBuilder:
                random.shuffle(REHEARSAL_DATA)
                batches = minibatch(REHEARSAL_DATA, size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                #for batch in partition_all(batch_size, REHEARSAL_DATA):
                    docs, golds = zip(*batch)
                    #texts, annotations = zip(*batch)
                    nlp_std.update(docs, golds, sgd=optimizer, losses=losses)
                print(f'loss: {losses}')

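minibatch(..., size=compounding(4.0, 32.0, 1.001)) feeds the loop with batches that grow geometrically from 4 towards 32 as training proceeds. A quick look at the schedule produced by the spaCy v2 utility used above:

from spacy.util import compounding

sizes = compounding(4.0, 32.0, 1.001)
print([round(next(sizes), 3) for _ in range(5)])  # 4.0, 4.004, 4.008, ... creeping up by 0.1% per draw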
@@ -221,7 +215,6 @@ class DataSetBuilder:
        for ent in doc_STD.ents:
            print(ent.text, ent.label_)


    def export_dataset_doccano(self, outputfile_name):
        """
        Doccano JSONL data format:
@@ -239,7 +232,6 @@ class DataSetBuilder:
        with jsonlines.open(f'./commentaries/data_parsed/doccano_data/{outputfile_name}.jsonl', mode='w') as writer:
            writer.write_all(output)


    def merge_rehearsed(self, data, revision_data):
        res = []
        revision_data = revision_data
@@ -258,7 +250,6 @@ class DataSetBuilder:

        return res


    def import_dataset_doccano(self, path):
        data = []
        with open(path) as infile:
@@ -266,20 +257,20 @@ class DataSetBuilder:

        for line in content:
            json_data = json.loads(line)
            ent = {'entities':[]}
            ent = {'entities': []}
            ent['entities'] = json_data['labels']
            data.append((json_data['text'], ent))

        self.TRAIN_DATA = data
        return data

if __name__ == '__main__':
    data = DataSetBuilder(df_commentary, df_ner_unique)

    ner_lookup, ner_clean_lookup = data.get_NER_lookup()
    data.get_commentaries()
    #data.build_train_data()
    #data.train_model()
    #data.export_dataset_doccano()
    #data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1')
    #data.train_model()
# if __name__ == '__main__':
# data = DataSetBuilder(df_commentary, df_ner_unique)
#
# ner_lookup, ner_clean_lookup = data.get_NER_lookup()
# data.get_commentaries()
# data.build_train_data()
# data.train_model()
# data.export_dataset_doccano()
# data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1')
# data.train_model()
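import_dataset_doccano() expects a Doccano-style JSONL export, one JSON object per line with a text field and a labels field; the code above reads labels straight into the spaCy entities dict. A sketch of one such line and how it becomes a training example, assuming the common [start, end, tag] triple layout and illustrative values:

import json

line = '{"text": "Come dice Aristotile nel primo de la Fisica", "labels": [[10, 20, "PER"]]}'
json_data = json.loads(line)
ent = {'entities': json_data['labels']}
train_example = (json_data['text'], ent)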