From 2324ddff9f629131c13e8910f24a1a9f6250d747 Mon Sep 17 00:00:00 2001 From: andrea Date: Fri, 23 Oct 2020 16:59:01 +0200 Subject: [PATCH] function to extend KB with data from annotated commentaries --- entity_linker/knowledge_base.py | 97 +++++++++++++----- .../{ => knowledge_base}/KB_abs_merged.pickle | Bin .../KB_abs_reversed.pickle | Bin main.py | 86 ++++++---------- preprocessing/ner_dataset_builder.py | 37 +++---- 5 files changed, 117 insertions(+), 103 deletions(-) rename entity_linker/{ => knowledge_base}/KB_abs_merged.pickle (100%) rename entity_linker/{ => knowledge_base}/KB_abs_reversed.pickle (100%) diff --git a/entity_linker/knowledge_base.py b/entity_linker/knowledge_base.py index cd73afa..6bdbc08 100644 --- a/entity_linker/knowledge_base.py +++ b/entity_linker/knowledge_base.py @@ -1,30 +1,33 @@ # TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz from difflib import SequenceMatcher import json +import numpy as np class KnowledgeBase: - - def __init__(self, kb_path): + def __init__(self, kb_path, extension=None): with open(kb_path, 'rb') as infile: data = json.load(infile) self.id2aut = data self.aut2id = {} + self.works2aut = {} self._popolate_aut2id() + if extension is not None: + self._extend_kb(extension) + self._popolate_aut2id() def link_entities(self, preds, deepfuzz=False): PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante'] WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART'] - print('-'*50) - print(f'Candidate authors (i.e., entitites matched): {PER_preds}') - # print(f'Candidates work:\n{WORK_preds}') + # print('-'*50) + # print(f'Candidate authors (i.e., entities matched): {PER_preds}') + # print(f'Candidates work :{WORK_preds}') COMMEDIA_DATE = 1321 - print('-'*50 + '\nChecking in KB...') - - # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'Aquino) + print('-'*50 + '\n\nOUTPUT:\n### Author matches:') + aut_res = [] for target in set(PER_preds): scores = [] deepscore = [] @@ -36,8 +39,10 @@ class KnowledgeBase: success = False for i in range(3): if scores[i][1] > .8: - print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"], scores[i][1]} - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}') + print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}') + #, scores[i][1]}') # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}') success = True + aut_res.append(target) break if deepfuzz and not success: for aut in self.aut2id.keys(): @@ -50,38 +55,74 @@ class KnowledgeBase: deepscore.append((aut, sim)) deepscore.sort(key=lambda tup: tup[1], reverse=True) for j in range(3): - if deepscore[j][1] > .8: + if deepscore[j][1] > .9: print( - f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"], deepscore[j][1]} - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}') + f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}') + #, deepscore[j][1]}') # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}') + aut_res.append(target) break - - return 0 - def _generate_utter_2_ent(self): - utt_2_ent = {} - for ent_en in self.kb.keys(): - for utt in self.kb[ent_en]['names']: - utt_2_ent[utt] = ent_en - return utt_2_ent + work_res = {} + if len(WORK_preds) != 0: + print('-' * 50 + '\n### Works matches:') + for target in set(WORK_preds): + scores_work = [] + for work in self.works2aut.keys(): 
+ sim = self._similar(target, work) + scores_work.append((work, sim)) + scores_work.sort(key=lambda tup: tup[1], reverse=True) + for i in range(3): + if scores_work[i][1] > .75: + print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}') + work_res[target] = self.works2aut[scores_work[i][0]] + break - def _check_other_lang(self, target, original_name): - other_names = self.kb[target[0]]['names'] - - scores = [] - for name in other_names: - sim = self._similar(original_name, name) - scores.append((name, sim)) - scores.sort(key=lambda tup: tup[1], reverse=True) - return scores + return aut_res, work_res def _similar(self,a, b): return SequenceMatcher(None, a, b).ratio() def _popolate_aut2id(self): for qid, values in self.id2aut.items(): + if qid == 'null': + continue if values is not None: l_names = set(values['aut_name'].values()) for name in l_names: self.aut2id[name] = qid + works = values['aut_works'] + if len(works) != 0: + for wid, wvalues in works.items(): + try: + self.works2aut[wvalues['it']] = self.id2aut[qid]['aut_name']['it'] + except: + continue + return self + def _extend_kb(self, df): + _qid = 0 + prev_work = '' + for i in range(len(df)): + row = df.iloc[i] + auth = row.quot_author + work = row.quot_title + if auth is not np.nan and work is not np.nan: + if work != prev_work: + try: + qid = self.aut2id[auth] + new_wid = f'W{_qid}' + _qid += 1 + self.id2aut[qid]['aut_works'][new_wid] = {'it': work} + prev_work = work + except: + new_qid = f'Q{str(_qid)}' + new_wid = f'W{str(_qid)}' + _qid += 1 + self.id2aut[new_qid] = {'aut_name': {'it': auth}, + 'aut_works': {new_wid: {'it': work}}, + 'aut_present_work': {}, + 'birth': 0} + prev_work = work + else: + continue \ No newline at end of file diff --git a/entity_linker/KB_abs_merged.pickle b/entity_linker/knowledge_base/KB_abs_merged.pickle similarity index 100% rename from entity_linker/KB_abs_merged.pickle rename to entity_linker/knowledge_base/KB_abs_merged.pickle diff --git a/entity_linker/KB_abs_reversed.pickle b/entity_linker/knowledge_base/KB_abs_reversed.pickle similarity index 100% rename from entity_linker/KB_abs_reversed.pickle rename to entity_linker/knowledge_base/KB_abs_reversed.pickle diff --git a/main.py b/main.py index 0a16215..76c920b 100644 --- a/main.py +++ b/main.py @@ -5,9 +5,8 @@ from spacy.util import minibatch, compounding import warnings from preprocessing.ner_dataset_builder import DataSetBuilder from entity_linker.knowledge_base import KnowledgeBase -from tqdm import tqdm +from tqdm import tqdm, trange from pathlib import Path -import pickle import random @@ -18,7 +17,16 @@ df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv') df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv') -def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'): +def pprint_com(comment, l=100): + i = 0 + while len(comment) > i + 100: + j = i + l + print(comment[i:j]) + i += 100 + print(comment[i:len(comment)]) + + +def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, nepochs=50, SPACY_MODEL_STD='it_core_news_sm'): model = spacy.load(SPACY_MODEL_STD) print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}') @@ -36,35 +44,22 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL with model.disable_pipes(*other_pipes) and warnings.catch_warnings(): print(f'Enabled pipes at training: {[pipe for pipe in model.pipe_names]}') - # warnings.filterwarnings("once", category=UserWarning, 
module='spacy') - optimizer = model.resume_training() - - n_epochs = 7 - #batch_size = 32 + n_epochs = nepochs print(f'\n## Begin Training') - for i in tqdm(range(n_epochs), desc='Iter'): - #print(f'Iteration {i+1}') + t = trange(n_epochs, desc='Iter') + for i in t: losses = {} random.shuffle(TRAIN_DATA) batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: docs, golds = zip(*batch) model.update(docs, golds, sgd=optimizer, losses=losses) + t.set_description(f'NER loss: {round(losses["ner"], 5)}') print(f'Final loss: {losses}') seed = random.randint(1, len(clean_commentaries)) - - def pprint_com(comment, l=100): - i = 0 - while len(comment) > i+100: - j = i+l - print(comment[i:j]) - i += 100 - print(comment[i:len(comment)]) - disabled.restore() - print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}') eg_eval = df_eval.iloc[seed] @@ -80,21 +75,11 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL for ent in doc.ents: print(ent.text, ent.label_) - """ - print('\n') - print('-'*50) - print('STANDARD NER MODEL PREDICTIONS:') - nlp_reloaded = spacy.load('it_core_news_sm') - doc_STD = nlp_reloaded(clean_comment) - for ent in doc_STD.ents: - print(ent.text, ent.label_) - """ - print('\n') print('-'*50) print('GOLD:') query = eg_eval['comment'] - gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']] + gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author']] print(gold) print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}') @@ -104,15 +89,6 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL def predict_candidates(model, comment, labels=None): - - def pprint_com(comment, l=100): - i = 0 - while len(comment) > i+100: - j = i+l - print(comment[i:j]) - i += 100 - print(comment[i:len(comment)]) - clean_comment = comment.replace('', '') clean_comment = clean_comment.replace('', '') clean_comment = clean_comment.replace('\\', '') @@ -123,7 +99,6 @@ def predict_candidates(model, comment, labels=None): print('\n') candidates = [(ent.text, ent.label_) for ent in doc.ents] - #print(candidates) if labels is not None: query = comment @@ -131,7 +106,7 @@ def predict_candidates(model, comment, labels=None): print(f'{len(gold)} GOLD TARGETS ' + '-'*50) for i in range(len(gold)): elem = gold.iloc[i] - print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}') + print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}') # print('\n') return candidates, gold @@ -168,6 +143,13 @@ def load_word_vectors(model, path_to_vec, max_vec=100000): return model +def connect_aut_work(list_aut, list_work, kb): + print('\n\nTODO') + # qid_list = [kb.aut2id[author] for author in list_aut] + # wid_list = [kb.works2aut[work] for work in list_work] + # print('lel') + + def main(): df_TRAIN = df_monarchia df_eval = df_convivio @@ -178,21 +160,21 @@ def main(): raw_commentaries_convivio = dataset.commentaries_eva # train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest') - # nlp = spacy.load('it_core_news_sm') - # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000) - # dataset_convivio = DataSetBuilder(df_eval, df_eval) - # dataset_convivio.export_dataset_doccano('std_convivio') - nlp = spacy.load('./model_fastText/model_spacy_latest') - # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000) - # 
print(len(list(nlp.vocab.strings))) # get whole model vocabulary seed = random.randint(1, len(commentaries_convivio_eva)) preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval) - kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json') - kb.link_entities(preds, deepfuzz=True) - print(f'\nComment Numbert: {seed}') + kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval) + aut_res, work_res = kb.link_entities(preds, deepfuzz=True) + + # Testing ------------------------ + # connect_aut_work(aut_res, work_res, kb) + # -------------------------------- + + print(f'\nComment Number: {seed}') + + # TODO: add a matcher that returns s_char and end_char of the matched entities! exit() diff --git a/preprocessing/ner_dataset_builder.py b/preprocessing/ner_dataset_builder.py index 7b9309a..189edfe 100644 --- a/preprocessing/ner_dataset_builder.py +++ b/preprocessing/ner_dataset_builder.py @@ -92,7 +92,6 @@ class DataSetBuilder: self.ner_clean_lookup = ner_clean_lookup return ner_lookup, ner_clean_lookup - def _annotate_commentaries(self): """ @@ -124,7 +123,6 @@ class DataSetBuilder: matches_in_clean_commentaries.append(res) return matches_in_commentaries, matches_in_clean_commentaries - def build_train_data(self): from collections import OrderedDict @@ -142,7 +140,6 @@ class DataSetBuilder: self.TRAIN_DATA = TRAIN_DATA return TRAIN_DATA - def get_rehearsal_data(self): revision_data = [] print('# NB: TAGGING WITH standard spacy model!') @@ -155,13 +152,12 @@ class DataSetBuilder: self.revision_data = revision_data return revision_data - def train_model(self): from toolz import partition_all # See Docs @ https://toolz.readthedocs.io/en/latest/ import random nlp_std = spacy.load(self.SPACY_MODEL_STD) - + #revision_data = self.get_rehearsal_data() #REHEARSAL_DATA = self.TRAIN_DATA[:300] + revision_data[:150] @@ -178,10 +174,10 @@ class DataSetBuilder: other_pipes = [pipe for pipe in nlp_std.pipe_names if pipe not in pipe_exceptions] optimizer = nlp_std.resume_training() - + with nlp_std.disable_pipes(*other_pipes) and warnings.catch_warnings(): warnings.filterwarnings("once", category=UserWarning, module='spacy') - + n_epochs = 10 #batch_size = 32 print(f'\n## Begin Training') @@ -191,9 +187,7 @@ class DataSetBuilder: random.shuffle(REHEARSAL_DATA) batches = minibatch(REHEARSAL_DATA, size=compounding(4.0, 32.0, 1.001)) for batch in batches: - #for batch in partition_all(batch_size, REHEARSAL_DATA): docs, golds = zip(*batch) - #texts, annotations = zip(*batch) nlp_std.update(docs, golds, sgd=optimizer, losses=losses) print(f'loss: {losses}') @@ -221,7 +215,6 @@ class DataSetBuilder: for ent in doc_STD.ents: print(ent.text, ent.label_) - def export_dataset_doccano(self, outputfile_name): """ Doccano JSONL data format: @@ -239,7 +232,6 @@ class DataSetBuilder: with jsonlines.open(f'./commentaries/data_parsed/doccano_data/{outputfile_name}.jsonl', mode='w') as writer: writer.write_all(output) - def merge_rehearsed(self, data, revision_data): res = [] revision_data = revision_data @@ -258,7 +250,6 @@ class DataSetBuilder: return res - def import_dataset_doccano(self, path): data = [] with open(path) as infile: @@ -266,20 +257,20 @@ class DataSetBuilder: for line in content: json_data = json.loads(line) - ent = {'entities':[]} + ent = {'entities': []} ent['entities'] = json_data['labels'] data.append((json_data['text'], ent)) self.TRAIN_DATA = data return data -if __name__ == '__main__': - data = 
DataSetBuilder(df_commentary, df_ner_unique) - - ner_lookup, ner_clean_lookup = data.get_NER_lookup() - data.get_commentaries() - #data.build_train_data() - #data.train_model() - #data.export_dataset_doccano() - #data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1') - #data.train_model() +# if __name__ == '__main__': +# data = DataSetBuilder(df_commentary, df_ner_unique) +# +# ner_lookup, ner_clean_lookup = data.get_NER_lookup() +# data.get_commentaries() +# data.build_train_data() +# data.train_model() +# data.export_dataset_doccano() +# data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1') +# data.train_model()
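

Usage sketch (reviewer illustration, not part of the commit): the patch adds an
`extension` argument to KnowledgeBase and makes link_entities() return a pair
(aut_res, work_res). A minimal way to exercise the new API, assuming a CSV of
annotated commentaries with quot_author and quot_title columns as read in
main.py — the CSV path and the example `preds` list below are placeholders:

    import pandas as pd
    from entity_linker.knowledge_base import KnowledgeBase

    # Annotated commentaries used to extend the KB (path is illustrative)
    df_eval = pd.read_csv('convivio_DF.csv')

    # Build the KB from the Wikimedia JSON and extend it with the annotations
    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json',
                       extension=df_eval)

    # NER output as (text, label) tuples, e.g. from predict_candidates() in main.py
    preds = [('Aristotile', 'PER'), ('Etica', 'WORK_OF_ART')]

    aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
    print(aut_res)   # author mentions matched in the KB
    print(work_res)  # {work mention: KB author name}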