import random
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from spacy.util import minibatch, compounding
from tqdm import tqdm, trange

from preprocessing.ner_dataset_builder import DataSetBuilder
from entity_linker.knowledge_base import KnowledgeBase

COMMENTARIES_PATH = './commentaries/'
DF_COMMENTARIES_PATH = './commentaries/data_parsed/'

# Parsed commentary DataFrames (one per work) plus the unique NER annotations.
df_monarchia = pd.read_csv(DF_COMMENTARIES_PATH + 'monarchia_DF.csv')
df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
df_rime = pd.read_csv(DF_COMMENTARIES_PATH + 'rime_DF.csv')
df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')
# TODO --> rime.xml


# Pretty-print a long comment wrapped at l characters per line.
def pprint_com(comment, l=100):
    i = 0
    while len(comment) > i + l:
        j = i + l
        print(comment[i:j])
        i += l
    print(comment[i:len(comment)])


# Fine-tune the NER pipe of a pretrained Italian spaCy model on TRAIN_DATA,
# preview predictions on a random evaluation comment, and save the model.
def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir,
                nepochs=50, SPACY_MODEL_STD='it_core_news_sm'):
    model = spacy.load(SPACY_MODEL_STD)
    print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')

    # Register every entity label found in the training annotations.
    ner = model.get_pipe('ner')
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Train only the NER component; all other pipes stay frozen.
    disabled = model.disable_pipes("tagger", "parser")
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in model.pipe_names if pipe not in pipe_exceptions]
    with model.disable_pipes(*other_pipes), warnings.catch_warnings():
        print(f'Enabled pipes at training: {[pipe for pipe in model.pipe_names]}')
        optimizer = model.resume_training()
        n_epochs = nepochs
        print('\n## Begin Training')
        t = trange(n_epochs, desc='Iter')
        for i in t:
            losses = {}
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                docs, golds = zip(*batch)
                model.update(docs, golds, sgd=optimizer, losses=losses)
            t.set_description(f'NER loss: {round(losses["ner"], 5)}')
        print(f'Final loss: {losses}')

    # Restore the disabled pipes and preview the model on a random evaluation comment.
    seed = random.randrange(len(clean_commentaries))
    disabled.restore()
    print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
    eg_eval = df_eval.iloc[seed]
    test_text = eg_eval['comment']
    clean_comment = test_text.replace('', '')
    clean_comment = clean_comment.replace('', '')
    clean_comment = clean_comment.replace('\\', '')
    print('\nSENTENCE:')
    pprint_com(clean_comment)
    doc = model(clean_comment)
    print('\nFINE-TUNED NER MODEL PREDICTIONS:')
    for ent in doc.ents:
        print(ent.text, ent.label_)
    print('\n')
    print('-' * 50)
    print('GOLD:')
    query = eg_eval['comment']
    gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author']]
    print(gold)
    print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
    save_model(model, 'it_dante_new', output_dir)
    return model


# Run the fine-tuned NER model on a commentary and return the predicted
# (text, label) candidates; if a labels DataFrame is given, also print and
# return the matching gold quotations.
def predict_candidates(model, comment, labels=None):
    clean_comment = comment.replace('', '')
    clean_comment = clean_comment.replace('', '')
    clean_comment = clean_comment.replace('\\', '')
    print()
    pprint_com(clean_comment)
    doc = model(clean_comment)
    print('\n')
    candidates = [(ent.text, ent.label_) for ent in doc.ents]
    gold = None
    if labels is not None:
        query = comment
        gold = labels[labels['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
        print(f'{len(gold)} GOLD TARGETS:')
        for i in range(len(gold)):
            elem = gold.iloc[i]
            print(f'Title: {elem["quot_title"]}\nAuthor: {elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')
        # print('\n')
    return candidates, gold
# Save the fine-tuned model to the output directory.
def save_model(model, new_model_name, output_dir):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        model.meta["name"] = new_model_name
        model.to_disk(output_dir)
        print("Saved model to", output_dir)


# Load up to max_vec pretrained word vectors (.vec text format) into the model's vocab.
def load_word_vectors(model, path_to_vec, max_vec=100000):
    with open(path_to_vec, 'r') as infile:
        header = infile.readline()
        n_row, n_dim = header.split()
        model.vocab.reset_vectors(width=int(n_dim))
        count = 0
        for _, line in tqdm(enumerate(infile), total=max_vec):
            count += 1
            line = line.rstrip()
            pieces = line.rsplit(' ', int(n_dim))
            word = pieces[0]
            # print("{} - {}".format(count, word))
            vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
            model.vocab.set_vector(word, vector)
            if count == max_vec:
                break
    return model


def connect_aut_work(list_aut, list_work, kb):
    print('\n\nTODO')
    # qid_list = [kb.aut2id[author] for author in list_aut]
    # wid_list = [kb.works2aut[work] for work in list_work]


def main():
    # Build the NER dataset from the doccano annotations.
    df_TRAIN = df_monarchia
    df_eval = df_rime
    dataset = DataSetBuilder(df_TRAIN, df_eval, df_ner_unique)
    TRAIN_DATA = dataset.import_dataset_doccano('./commentaries/data_parsed/doccano_data/from_doccano_hdn1.json')
    commentaries_convivio_eva = dataset.clean_commentaries_eva
    commentaries_monarchia = dataset.clean_commentaries
    raw_commentaries_convivio = dataset.commentaries_eva

    # train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest')
    nlp = spacy.load('./model_fastText/model_spacy_latest')

    # Predict entity candidates for a random commentary and link them against the knowledge base.
    seed = random.randrange(len(commentaries_convivio_eva))
    preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)
    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia.json', extension=df_eval)
    aut_res, work_res = kb.link_entities(preds, deepfuzz=True)

    # Testing ------------------------
    # connect_aut_work(aut_res, work_res, kb)
    # --------------------------------

    print(f'\nComment Number: {seed}')
    # TODO: add a matcher that returns s_char and end_char of the matched entities!
    exit()


if __name__ == '__main__':
    main()