import pandas as pd
import spacy
import numpy as np
from spacy.util import minibatch, compounding
import warnings
from preprocessing.ner_dataset_builder import DataSetBuilder
from entity_linker.knowledge_base import KnowledgeBase
from tqdm import tqdm
from pathlib import Path
import pickle
import random


COMMENTARIES_PATH = './commentaries/'
DF_COMMENTARIES_PATH = './commentaries/data_parsed/'
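
# Parsed commentary dataframes: in main() the Monarchia commentaries are used
# for training and the Convivio commentaries for evaluation.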
df_monarchia = pd.read_csv(DF_COMMENTARIES_PATH + 'monarchia_DF.csv')
df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')


def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
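    """Fine-tune the NER component of a pretrained Italian spaCy pipeline.

    TRAIN_DATA is expected in the spaCy v2 format of (text, {"entities":
    [(start, end, label), ...]}) tuples. After training, the model is run on
    one random commentary from df_eval as a quick sanity check and saved to
    output_dir via save_model().
    """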
    model = spacy.load(SPACY_MODEL_STD)
    print(f'Enabled pipes: {model.pipe_names}')

    ner = model.get_pipe('ner')

    # Register every entity label seen in the training data with the NER pipe.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Disable the components we do not want to update; restored after training.
    disabled = model.disable_pipes("tagger", "parser")

    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in model.pipe_names if pipe not in pipe_exceptions]

    # Train with only the NER-related pipes enabled, keeping spaCy warnings contained.
    with model.disable_pipes(*other_pipes), warnings.catch_warnings():
        print(f'Enabled pipes at training: {model.pipe_names}')
        # warnings.filterwarnings("once", category=UserWarning, module='spacy')

        optimizer = model.resume_training()
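        # NOTE: resume_training() returns an optimizer that updates the
        # existing, pretrained weights in place (unlike begin_training(),
        # which would re-initialize the model).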
        n_epochs = 7

        print('\n## Begin Training')
        for i in tqdm(range(n_epochs), desc='Iter'):
            losses = {}
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
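            # compounding(4.0, 32.0, 1.001) produces batch sizes that grow
            # from 4 towards 32 as training progresses, following the spaCy v2
            # training examples.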
            for batch in batches:
                docs, golds = zip(*batch)
                model.update(docs, golds, sgd=optimizer, losses=losses)
        print(f'Final loss: {losses}')

    seed = random.randint(0, len(clean_commentaries) - 1)

    def pprint_com(comment, width=100):
        # Print `comment` wrapped at `width` characters per line.
        i = 0
        while len(comment) > i + width:
            print(comment[i:i + width])
            i += width
        print(comment[i:])

    # Restore the pipes that were disabled before training.
    disabled.restore()

    print(f'Enabled pipes: {model.pipe_names}')

    # Quick sanity check on one random evaluation commentary.
    eg_eval = df_eval.iloc[seed]
    test_text = eg_eval['comment']
    clean_comment = test_text.replace('<i>', '')
    clean_comment = clean_comment.replace('</i>', '')
    clean_comment = clean_comment.replace('\\', '')

    print('\nSENTENCE:')
    pprint_com(clean_comment)
    doc = model(clean_comment)
    print('\nFINE-TUNED NER MODEL PREDICTIONS:')
    for ent in doc.ents:
        print(ent.text, ent.label_)

    """
    print('\n')
    print('-' * 50)
    print('STANDARD NER MODEL PREDICTIONS:')
    nlp_reloaded = spacy.load('it_core_news_sm')
    doc_STD = nlp_reloaded(clean_comment)
    for ent in doc_STD.ents:
        print(ent.text, ent.label_)
    """

    print('\n')
    print('-' * 50)
    print('GOLD:')
    query = eg_eval['comment']
    gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
    print(gold)

    print(f'Enabled pipes: {model.pipe_names}')
    save_model(model, 'it_dante_new', output_dir)

    return model


def predict_candidates(model, comment, labels=None):
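    """Run the fine-tuned NER model on a single raw commentary.

    Returns the predicted (entity text, label) pairs and, when a labels
    dataframe is passed, the gold rows (title, author, type, URI) matching
    that comment; otherwise gold is None.
    """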
    def pprint_com(comment, width=100):
        # Print `comment` wrapped at `width` characters per line.
        i = 0
        while len(comment) > i + width:
            print(comment[i:i + width])
            i += width
        print(comment[i:])

    clean_comment = comment.replace('<i>', '')
    clean_comment = clean_comment.replace('</i>', '')
    clean_comment = clean_comment.replace('\\', '')

    print()
    pprint_com(clean_comment)
    doc = model(clean_comment)

    print('\n')
    candidates = [(ent.text, ent.label_) for ent in doc.ents]

    gold = None
    if labels is not None:
        query = comment
        gold = labels[labels['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
        print(f'{len(gold)} GOLD TARGETS ' + '-' * 50)
        for i in range(len(gold)):
            elem = gold.iloc[i]
            print(f'Title: {elem["quot_title"]}\nAuthor: {elem["quot_author"]}\nType: {elem["quot_type"]}')

    return candidates, gold


def save_model(model, new_model_name, output_dir):
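    """Rename the model and serialize it to output_dir, creating the directory if needed."""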
    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        model.meta["name"] = new_model_name
        model.to_disk(output_dir)
        print("Saved model to", output_dir)


def load_word_vectors(model, path_to_vec, max_vec=100000):
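    """Load up to max_vec word vectors from a plain-text .vec file into the model vocab.

    The file is assumed to follow the fastText text format: a header line with
    the row count and vector dimension, then one "word v1 ... vn" entry per line.
    """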
    with open(path_to_vec, 'r', encoding='utf-8') as infile:
        header = infile.readline()
        n_row, n_dim = header.split()
        model.vocab.reset_vectors(width=int(n_dim))
        count = 0
        for line in tqdm(infile, total=max_vec):
            count += 1
            line = line.rstrip()
            pieces = line.rsplit(' ', int(n_dim))
            word = pieces[0]
            vector = np.asarray([float(v) for v in pieces[1:]], dtype='f')
            model.vocab.set_vector(word, vector)
            if count == max_vec:
                break

    return model


def main():
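    """Build the doccano-annotated dataset, load the fine-tuned spaCy model and
    run NER plus knowledge-base linking on one random Convivio commentary."""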
    df_TRAIN = df_monarchia
    df_eval = df_convivio
    dataset = DataSetBuilder(df_TRAIN, df_eval, df_ner_unique)
    TRAIN_DATA = dataset.import_dataset_doccano('./commentaries/data_parsed/doccano_data/from_doccano_hdn1.json')
    commentaries_convivio_eva = dataset.clean_commentaries_eva
    commentaries_monarchia = dataset.clean_commentaries
    raw_commentaries_convivio = dataset.commentaries_eva

    # train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest')
    # nlp = spacy.load('it_core_news_sm')
    # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
    # dataset_convivio = DataSetBuilder(df_eval, df_eval)
    # dataset_convivio.export_dataset_doccano('std_convivio')

    nlp = spacy.load('./model_fastText/model_spacy_latest')
    # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
    # print(len(list(nlp.vocab.strings)))  # get whole model vocabulary

    seed = random.randint(0, len(commentaries_convivio_eva) - 1)
    preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)

    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json')
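    # Link the predicted entity mentions against the local Wikimedia-based
    # knowledge base; deepfuzz is a project-specific option that (presumably)
    # enables a more aggressive fuzzy string match.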
    kb.link_entities(preds, deepfuzz=True)
    print(f'\nComment number: {seed}')


if __name__ == '__main__':
    main()