# eventExtractionHDN/main.py

import pandas as pd
import spacy
import numpy as np
from spacy.util import minibatch, compounding
import warnings
from preprocessing.ner_dataset_builder import DataSetBuilder
from entity_linker.knowledge_base import KnowledgeBase
from tqdm import tqdm, trange
from pathlib import Path
import random


# Folder with the commentary files and its sub-folder with the parsed CSV exports.
COMMENTARIES_PATH = './commentaries/'
DF_COMMENTARIES_PATH = './commentaries/data_parsed/'

# The parsed CSV files are read once, at import time, into module-level frames.
df_monarchia = pd.read_csv(f'{DF_COMMENTARIES_PATH}monarchia_DF.csv')
df_convivio = pd.read_csv(f'{DF_COMMENTARIES_PATH}convivio_DF.csv')
df_rime = pd.read_csv(f'{DF_COMMENTARIES_PATH}rime_DF.csv')
df_ner_unique = pd.read_csv(f'{DF_COMMENTARIES_PATH}ner_unique_monarchia.csv')

# TODO --> rime.xml


def pprint_com(comment, l=100):
    """Print ``comment`` wrapped into consecutive chunks of ``l`` characters.

    The text is cut at fixed offsets (no word-boundary handling). The last
    chunk holds the remainder and is always printed, so an empty ``comment``
    prints a single empty line.

    Args:
        comment: the text to print.
        l: maximum number of characters per printed line; must be positive.

    Raises:
        ValueError: if ``l`` is not positive.
    """
    if l <= 0:
        # A non-positive width could never advance through the text.
        raise ValueError(f'line width must be positive, got {l}')

    start = 0
    # The chunk width drives both the loop condition and the step, so that
    # every character is printed exactly once for any value of ``l``.
    while len(comment) > start + l:
        print(comment[start:start + l])
        start += l
    print(comment[start:])


def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, nepochs=50, SPACY_MODEL_STD='it_core_news_sm'):
    """Fine-tune the NER pipe of a spaCy model, print a spot check and save it.

    Args:
        TRAIN_DATA: iterable of ``(text, annotations)`` pairs. ``annotations``
            is a dict whose ``"entities"`` entry lists entity tuples with the
            label at position 2. The caller's object is not modified.
        clean_commentaries: sized collection; together with ``df_eval`` its
            length bounds the random choice of the spot-check row.
        df_eval: DataFrame with at least the columns ``comment``,
            ``quot_title`` and ``quot_author``.
        output_dir: directory passed on to ``save_model``.
        nepochs: number of passes over the training data.
        SPACY_MODEL_STD: name or path of the spaCy model to start from.

    Returns:
        The fine-tuned spaCy model.
    """
    model = spacy.load(SPACY_MODEL_STD)
    print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')

    ner = model.get_pipe('ner')

    # Work on a private copy: the per-epoch shuffle below is in-place and
    # must not reorder the caller's data.
    train_data = list(TRAIN_DATA)

    # Register every label found in the annotations with the NER pipe.
    for _, annotations in train_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    disabled = model.disable_pipes("tagger", "parser")

    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in model.pipe_names if pipe not in pipe_exceptions]

    losses = {}
    # Both context managers must be entered: joining them with ``and`` would
    # hand only one of the two objects to the ``with`` statement.
    with model.disable_pipes(*other_pipes), warnings.catch_warnings():
        print(f'Enabled pipes at training: {[pipe for pipe in model.pipe_names]}')
        optimizer = model.resume_training()
        print('\n## Begin Training')
        progress = trange(nepochs, desc='Iter')
        for _ in progress:
            losses = {}
            random.shuffle(train_data)
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, batch_annotations = zip(*batch)
                model.update(texts, batch_annotations, sgd=optimizer, losses=losses)
            # ``losses`` has no "ner" entry when there was nothing to train on.
            progress.set_description(f'NER loss: {round(losses.get("ner", 0.0), 5)}')
        print(f'Final loss: {losses}')

    disabled.restore()
    print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')

    # Pick a random evaluation row. ``randint`` includes both ends, so the
    # upper bound is the last valid position; it is also capped by the number
    # of rows in ``df_eval`` because the value is used with ``.iloc``.
    n_candidates = min(len(clean_commentaries), len(df_eval))
    seed = random.randint(0, n_candidates - 1)

    eg_eval = df_eval.iloc[seed]
    test_text = eg_eval['comment']
    # Drop the italic tags and backslashes before running the model.
    clean_comment = test_text.replace('<i>', '').replace('</i>', '').replace('\\', '')

    print('\nSENTENCE:')
    pprint_com(clean_comment)
    doc = model(clean_comment)
    print('\nFINED-TUNED NER MODEL PREDICTIONS:')
    for ent in doc.ents:
        print(ent.text, ent.label_)

    print('\n')
    print('-'*50)
    print('GOLD:')
    # All rows that share the sampled comment text carry its gold targets.
    gold = df_eval[df_eval['comment'] == test_text][['quot_title', 'quot_author']]
    print(gold)

    print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
    save_model(model, 'it_dante_new', output_dir)

    return model


def predict_candidates(model, comment, labels=None):
    """Run ``model`` on a cleaned comment and collect the predicted entities.

    The comment is printed (wrapped) after removing ``<i>``/``</i>`` tags and
    backslashes. When ``labels`` is given, the gold targets of the comment
    are looked up by exact match on the raw ``comment`` text and printed.

    Args:
        model: callable returning a document with an ``ents`` attribute whose
            items expose ``text`` and ``label_``.
        comment: raw comment text.
        labels: optional DataFrame with the columns ``comment``,
            ``quot_title``, ``quot_author``, ``quot_type`` and ``quot_uri``.

    Returns:
        A tuple ``(candidates, gold)``: ``candidates`` is a list of
        ``(entity_text, entity_label)`` pairs; ``gold`` is the matching slice
        of ``labels``, or ``None`` when ``labels`` is not provided.
    """
    clean_comment = comment.replace('<i>', '').replace('</i>', '').replace('\\', '')

    print()
    pprint_com(clean_comment)
    doc = model(clean_comment)

    print('\n')
    candidates = [(ent.text, ent.label_) for ent in doc.ents]

    # Defined up front so the return below also works without ``labels``.
    gold = None
    if labels is not None:
        gold = labels[labels['comment'] == comment][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
        print(f'{len(gold)} GOLD TARGETS:')
        for _, elem in gold.iterrows():
            print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')

    return candidates, gold


def save_model(model, new_model_name, output_dir):
    """Rename ``model`` and write it to ``output_dir``.

    Args:
        model: object exposing a ``meta`` mapping and a ``to_disk`` method.
        new_model_name: value stored under ``model.meta["name"]``.
        output_dir: target directory (string or path). When ``None`` nothing
            is saved and the model is left untouched.
    """
    if output_dir is None:
        # No destination given: skip saving instead of failing on ``None``.
        return

    output_dir = Path(output_dir)
    # Create missing parent folders too, and accept an existing directory.
    output_dir.mkdir(parents=True, exist_ok=True)
    model.meta["name"] = new_model_name
    model.to_disk(output_dir)
    print("Saved model to", output_dir)


def load_word_vectors(model, path_to_vec, max_vec=100000):
    """Load word vectors from a text file into ``model.vocab``.

    The first line of the file is a header with two whitespace-separated
    fields; the second one is the vector width. Every following line holds a
    word and its components separated by single spaces.

    Args:
        model: object whose ``vocab`` exposes ``reset_vectors`` and
            ``set_vector``.
        path_to_vec: path of the vector file, read as UTF-8.
        max_vec: reading stops once this many lines after the header have
            been consumed.

    Returns:
        The same ``model``, with its vocabulary vectors replaced.
    """
    # An explicit encoding keeps non-ASCII words independent of the platform
    # default.
    with open(path_to_vec, 'r', encoding='utf-8') as infile:
        _n_rows, n_dim = infile.readline().split()
        n_dim = int(n_dim)
        model.vocab.reset_vectors(width=n_dim)
        for count, line in tqdm(enumerate(infile, start=1), total=max_vec):
            line = line.rstrip()
            if line:
                # Split from the right so that a word containing spaces stays
                # in one piece in front of its ``n_dim`` components.
                pieces = line.rsplit(' ', n_dim)
                word = pieces[0]
                vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
                model.vocab.set_vector(word, vector)
            # Blank lines carry no vector but still count as consumed lines.
            if count == max_vec:
                break

    return model


def connect_aut_work(list_aut, list_work, kb):
    """Placeholder: currently only prints a TODO marker.

    The three parameters are accepted but not used yet.
    """
    print('\n\nTODO')
    # Disabled sketch of the intended lookups through ``kb``:
    # qid_list = [kb.aut2id[author] for author in list_aut]
    # wid_list = [kb.works2aut[work] for work in list_work]


def main():
    """Predict and link the entities of one randomly chosen comment.

    Builds the dataset from the module-level data frames, loads the saved
    spaCy model, runs ``predict_candidates`` on a random raw comment and
    passes the predictions to the knowledge base for linking.
    """
    df_TRAIN = df_monarchia
    df_eval = df_rime
    dataset = DataSetBuilder(df_TRAIN, df_eval, df_ner_unique)

    # Only consumed by the disabled training call below.
    TRAIN_DATA = dataset.import_dataset_doccano('./commentaries/data_parsed/doccano_data/from_doccano_hdn1.json')
    commentaries_convivio_eva = dataset.clean_commentaries_eva
    raw_commentaries_convivio = dataset.commentaries_eva

    # train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest')
    nlp = spacy.load('./model_fastText/model_spacy_latest')

    # ``randint`` includes both ends: the upper bound must be the last valid
    # position, and position 0 must be selectable as well.
    seed = random.randint(0, len(commentaries_convivio_eva) - 1)

    preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia.json', extension=df_eval)
    aut_res, work_res = kb.link_entities(preds, deepfuzz=True)

    # Testing ------------------------
    # connect_aut_work(aut_res, work_res, kb)
    # --------------------------------

    print(f'\nComment Number: {seed}')

    # TODO: add a matcher that returns s_char and end_char of the matched entities!

    # The function simply returns here: an explicit ``exit()`` would also
    # terminate any program that imports and calls ``main``.


if __name__ == '__main__':
    main()