function to extend KB with data from annotated commentaries
This commit is contained in:
parent fb84b36b90
commit 2324ddff9f
entity_linker/knowledge_base.py

@@ -1,30 +1,33 @@
 # TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz
 from difflib import SequenceMatcher
 import json
+import numpy as np


 class KnowledgeBase:
-    def __init__(self, kb_path):
+    def __init__(self, kb_path, extension=None):
         with open(kb_path, 'rb') as infile:
             data = json.load(infile)

         self.id2aut = data
         self.aut2id = {}
+        self.works2aut = {}
         self._popolate_aut2id()
+        if extension is not None:
+            self._extend_kb(extension)
+            self._popolate_aut2id()

     def link_entities(self, preds, deepfuzz=False):
         PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
         WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
-        print('-'*50)
-        print(f'Candidate authors (i.e., entitites matched): {PER_preds}')
-        # print(f'Candidates work:\n{WORK_preds}')
+        # print('-'*50)
+        # print(f'Candidate authors (i.e., entities matched): {PER_preds}')
+        # print(f'Candidates work :{WORK_preds}')

         COMMEDIA_DATE = 1321
-        print('-'*50 + '\nChecking in KB...')
+        print('-'*50 + '\n\nOUTPUT:\n### Author matches:')

-        # TODO: in the author dict I should insert also the single name (e.g., Tommaso --> Tommaso d'Aquino)

+        aut_res = []
         for target in set(PER_preds):
             scores = []
             deepscore = []
@@ -36,8 +39,10 @@ class KnowledgeBase:
             success = False
             for i in range(3):
                 if scores[i][1] > .8:
-                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"], scores[i][1]} - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
+                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[scores[i][0]]]["aut_name"]["it"]}')
+                    #, scores[i][1]}') # - born in {self.id2aut[self.aut2id[scores[i][0]]]["birth"]}')
                     success = True
+                    aut_res.append(target)
                     break
             if deepfuzz and not success:
                 for aut in self.aut2id.keys():
@@ -50,38 +55,74 @@ class KnowledgeBase:
                     deepscore.append((aut, sim))
                 deepscore.sort(key=lambda tup: tup[1], reverse=True)
                 for j in range(3):
-                    if deepscore[j][1] > .8:
+                    if deepscore[j][1] > .9:
                         print(
-                            f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"], deepscore[j][1]} - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
+                            f'Prediction (S): {target} - {self.id2aut[self.aut2id[deepscore[j][0]]]["aut_name"]["it"]}')
+                        #, deepscore[j][1]}') # - born in {self.id2aut[self.aut2id[deepscore[j][0]]]["birth"]}')
+                        aut_res.append(target)
                         break

-        return 0
-
-    def _generate_utter_2_ent(self):
-        utt_2_ent = {}
-        for ent_en in self.kb.keys():
-            for utt in self.kb[ent_en]['names']:
-                utt_2_ent[utt] = ent_en
-        return utt_2_ent
-
-    def _check_other_lang(self, target, original_name):
-        other_names = self.kb[target[0]]['names']
-        scores = []
-        for name in other_names:
-            sim = self._similar(original_name, name)
-            scores.append((name, sim))
-        scores.sort(key=lambda tup: tup[1], reverse=True)
-        return scores
+        work_res = {}
+        if len(WORK_preds) != 0:
+            print('-' * 50 + '\n### Works matches:')
+            for target in set(WORK_preds):
+                scores_work = []
+                for work in self.works2aut.keys():
+                    sim = self._similar(target, work)
+                    scores_work.append((work, sim))
+                scores_work.sort(key=lambda tup: tup[1], reverse=True)
+                for i in range(3):
+                    if scores_work[i][1] > .75:
+                        print(f'Prediction (F): {target} by: {self.works2aut[scores_work[i][0]]}')
+                        work_res[target] = self.works2aut[scores_work[i][0]]
+                        break
+
+        return aut_res, work_res

     def _similar(self,a, b):
         return SequenceMatcher(None, a, b).ratio()

     def _popolate_aut2id(self):
         for qid, values in self.id2aut.items():
+            if qid == 'null':
+                continue
             if values is not None:
                 l_names = set(values['aut_name'].values())
                 for name in l_names:
                     self.aut2id[name] = qid
+                works = values['aut_works']
+                if len(works) != 0:
+                    for wid, wvalues in works.items():
+                        try:
+                            self.works2aut[wvalues['it']] = self.id2aut[qid]['aut_name']['it']
+                        except:
+                            continue

         return self

+    def _extend_kb(self, df):
+        _qid = 0
+        prev_work = ''
+        for i in range(len(df)):
+            row = df.iloc[i]
+            auth = row.quot_author
+            work = row.quot_title
+            if auth is not np.nan and work is not np.nan:
+                if work != prev_work:
+                    try:
+                        qid = self.aut2id[auth]
+                        new_wid = f'W{_qid}'
+                        _qid += 1
+                        self.id2aut[qid]['aut_works'][new_wid] = {'it': work}
+                        prev_work = work
+                    except:
+                        new_qid = f'Q{str(_qid)}'
+                        new_wid = f'W{str(_qid)}'
+                        _qid += 1
+                        self.id2aut[new_qid] = {'aut_name': {'it': auth},
+                                                'aut_works': {new_wid: {'it': work}},
+                                                'aut_present_work': {},
+                                                'birth': 0}
+                        prev_work = work
+            else:
+                continue
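For orientation, a minimal usage sketch of the new extension hook. The constructor signature, the quot_author / quot_title columns, the KB path, and the (aut_res, work_res) return value are taken from the diff above; the DataFrame rows and the preds list below are made-up examples (in main.py the extension passed is the evaluation DataFrame df_eval, and preds comes from the NER model).

import pandas as pd
from entity_linker.knowledge_base import KnowledgeBase

# Hypothetical annotation rows; the real extension DataFrame is df_eval in main.py.
df_annotations = pd.DataFrame({
    'quot_author': ['Aristotele', 'Virgilio'],
    'quot_title': ['Etica Nicomachea', 'Eneide'],
})

# Rows whose author is already in the KB gain a new work id (W0, W1, ...);
# unknown authors get a fresh Qn entry via _extend_kb.
kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json',
                   extension=df_annotations)

# preds are (text, label) pairs, as produced by predict_candidates in main.py.
preds = [('Aristotele', 'PER'), ('Eneide', 'WORK_OF_ART')]
aut_res, work_res = kb.link_entities(preds, deepfuzz=True)  # matched authors, {work: author}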
main.py (86 changed lines)

@@ -5,9 +5,8 @@ from spacy.util import minibatch, compounding
 import warnings
 from preprocessing.ner_dataset_builder import DataSetBuilder
 from entity_linker.knowledge_base import KnowledgeBase
-from tqdm import tqdm
+from tqdm import tqdm, trange
 from pathlib import Path
-import pickle
 import random


@@ -18,7 +17,16 @@ df_convivio = pd.read_csv(DF_COMMENTARIES_PATH + 'convivio_DF.csv')
 df_ner_unique = pd.read_csv(DF_COMMENTARIES_PATH + 'ner_unique_monarchia.csv')


-def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL_STD='it_core_news_sm'):
+def pprint_com(comment, l=100):
+    i = 0
+    while len(comment) > i + 100:
+        j = i + l
+        print(comment[i:j])
+        i += 100
+    print(comment[i:len(comment)])
+
+
+def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, nepochs=50, SPACY_MODEL_STD='it_core_news_sm'):
     model = spacy.load(SPACY_MODEL_STD)
     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@@ -36,35 +44,22 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL

     with model.disable_pipes(*other_pipes) and warnings.catch_warnings():
         print(f'Enabled pipes at training: {[pipe for pipe in model.pipe_names]}')
-        # warnings.filterwarnings("once", category=UserWarning, module='spacy')

         optimizer = model.resume_training()
-        n_epochs = 7
-        #batch_size = 32
+        n_epochs = nepochs
         print(f'\n## Begin Training')
-        for i in tqdm(range(n_epochs), desc='Iter'):
-            #print(f'Iteration {i+1}')
+        t = trange(n_epochs, desc='Iter')
+        for i in t:
             losses = {}
             random.shuffle(TRAIN_DATA)
             batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
             for batch in batches:
                 docs, golds = zip(*batch)
                 model.update(docs, golds, sgd=optimizer, losses=losses)
+            t.set_description(f'NER loss: {round(losses["ner"], 5)}')
         print(f'Final loss: {losses}')

     seed = random.randint(1, len(clean_commentaries))

-    def pprint_com(comment, l=100):
-        i = 0
-        while len(comment) > i+100:
-            j = i+l
-            print(comment[i:j])
-            i += 100
-        print(comment[i:len(comment)])
-
     disabled.restore()

     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')

     eg_eval = df_eval.iloc[seed]
@@ -80,21 +75,11 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL
     for ent in doc.ents:
         print(ent.text, ent.label_)

-    """
-    print('\n')
-    print('-'*50)
-    print('STANDARD NER MODEL PREDICTIONS:')
-    nlp_reloaded = spacy.load('it_core_news_sm')
-    doc_STD = nlp_reloaded(clean_comment)
-    for ent in doc_STD.ents:
-        print(ent.text, ent.label_)
-    """

     print('\n')
     print('-'*50)
     print('GOLD:')
     query = eg_eval['comment']
-    gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author', 'quot_type', 'quot_uri']]
+    gold = df_eval[df_eval['comment'] == query][['quot_title', 'quot_author']]
     print(gold)

     print(f'Enabled pipes: {[pipe for pipe in model.pipe_names]}')
@@ -104,15 +89,6 @@ def train_model(TRAIN_DATA, clean_commentaries, df_eval, output_dir, SPACY_MODEL


 def predict_candidates(model, comment, labels=None):
-
-    def pprint_com(comment, l=100):
-        i = 0
-        while len(comment) > i+100:
-            j = i+l
-            print(comment[i:j])
-            i += 100
-        print(comment[i:len(comment)])
-
     clean_comment = comment.replace('<i>', '')
     clean_comment = clean_comment.replace('</i>', '')
     clean_comment = clean_comment.replace('\\', '')
@@ -123,7 +99,6 @@ def predict_candidates(model, comment, labels=None):

     print('\n')
     candidates = [(ent.text, ent.label_) for ent in doc.ents]
-    #print(candidates)

     if labels is not None:
         query = comment
@@ -131,7 +106,7 @@ def predict_candidates(model, comment, labels=None):
         print(f'{len(gold)} GOLD TARGETS ' + '-'*50)
         for i in range(len(gold)):
             elem = gold.iloc[i]
-            print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}')
+            print(f'Title: {elem["quot_title"]}\nAuthor:{elem["quot_author"]}\nType: {elem["quot_type"]}\n{"-"*20}')
             # print('\n')

     return candidates, gold
@@ -168,6 +143,13 @@ def load_word_vectors(model, path_to_vec, max_vec=100000):
     return model


+def connect_aut_work(list_aut, list_work, kb):
+    print('\n\nTODO')
+    # qid_list = [kb.aut2id[author] for author in list_aut]
+    # wid_list = [kb.works2aut[work] for work in list_work]
+    # print('lel')
+
+
 def main():
     df_TRAIN = df_monarchia
     df_eval = df_convivio
@@ -178,21 +160,21 @@ def main():
     raw_commentaries_convivio = dataset.commentaries_eva

     # train_model(TRAIN_DATA, commentaries_convivio_eva, df_eval, './model_fastText/model_spacy_latest')
-    # nlp = spacy.load('it_core_news_sm')
-    # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
-    # dataset_convivio = DataSetBuilder(df_eval, df_eval)
-    # dataset_convivio.export_dataset_doccano('std_convivio')

     nlp = spacy.load('./model_fastText/model_spacy_latest')
-    # nlp = load_word_vectors(nlp, './embeddings/cc.it.300.vec', 50000)
-    # print(len(list(nlp.vocab.strings))) # get whole model vocabulary

     seed = random.randint(1, len(commentaries_convivio_eva))
     preds, df_gold = predict_candidates(nlp, raw_commentaries_convivio[seed], df_eval)

-    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json')
-    kb.link_entities(preds, deepfuzz=True)
-    print(f'\nComment Numbert: {seed}')
+    kb = KnowledgeBase('entity_linker/knowledge_base/KB_wikimedia_with_dates.json', extension=df_eval)
+    aut_res, work_res = kb.link_entities(preds, deepfuzz=True)
+
+    # Testing ------------------------
+    # connect_aut_work(aut_res, work_res, kb)
+    # --------------------------------
+
+    print(f'\nComment Number: {seed}')
+
+    # TODO: add a matcher that returns s_char and end_char of the matched entities!
+
     exit()

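As a side note on the training loop change above: progress is now driven through tqdm's trange, with the running NER loss written into the bar description. A minimal standalone sketch of that pattern (the loss values here are dummies; in train_model the losses dict is filled by model.update):

from tqdm import trange

t = trange(5, desc='Iter')
for i in t:
    losses = {'ner': 1.0 / (i + 1)}  # stand-in for the loss dict populated via model.update
    t.set_description(f'NER loss: {round(losses["ner"], 5)}')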
preprocessing/ner_dataset_builder.py

@@ -92,7 +92,6 @@ class DataSetBuilder:
         self.ner_clean_lookup = ner_clean_lookup

         return ner_lookup, ner_clean_lookup

-
     def _annotate_commentaries(self):
         """
@@ -124,7 +123,6 @@ class DataSetBuilder:
             matches_in_clean_commentaries.append(res)

         return matches_in_commentaries, matches_in_clean_commentaries

-
     def build_train_data(self):
         from collections import OrderedDict
@@ -142,7 +140,6 @@ class DataSetBuilder:
         self.TRAIN_DATA = TRAIN_DATA
         return TRAIN_DATA

-
     def get_rehearsal_data(self):
         revision_data = []
         print('# NB: TAGGING WITH standard spacy model!')
@@ -155,13 +152,12 @@ class DataSetBuilder:
         self.revision_data = revision_data
         return revision_data

-
     def train_model(self):
         from toolz import partition_all  # See Docs @ https://toolz.readthedocs.io/en/latest/
         import random

         nlp_std = spacy.load(self.SPACY_MODEL_STD)

         #revision_data = self.get_rehearsal_data()

         #REHEARSAL_DATA = self.TRAIN_DATA[:300] + revision_data[:150]
@@ -178,10 +174,10 @@ class DataSetBuilder:
         other_pipes = [pipe for pipe in nlp_std.pipe_names if pipe not in pipe_exceptions]

         optimizer = nlp_std.resume_training()

         with nlp_std.disable_pipes(*other_pipes) and warnings.catch_warnings():
             warnings.filterwarnings("once", category=UserWarning, module='spacy')

             n_epochs = 10
             #batch_size = 32
             print(f'\n## Begin Training')
@@ -191,9 +187,7 @@ class DataSetBuilder:
             random.shuffle(REHEARSAL_DATA)
             batches = minibatch(REHEARSAL_DATA, size=compounding(4.0, 32.0, 1.001))
             for batch in batches:
-            #for batch in partition_all(batch_size, REHEARSAL_DATA):
                 docs, golds = zip(*batch)
-                #texts, annotations = zip(*batch)
                 nlp_std.update(docs, golds, sgd=optimizer, losses=losses)
             print(f'loss: {losses}')

@@ -221,7 +215,6 @@ class DataSetBuilder:
         for ent in doc_STD.ents:
             print(ent.text, ent.label_)

-
     def export_dataset_doccano(self, outputfile_name):
         """
         Doccano JSONL data format:
@@ -239,7 +232,6 @@ class DataSetBuilder:
         with jsonlines.open(f'./commentaries/data_parsed/doccano_data/{outputfile_name}.jsonl', mode='w') as writer:
             writer.write_all(output)

-
     def merge_rehearsed(self, data, revision_data):
         res = []
         revision_data = revision_data
@@ -258,7 +250,6 @@ class DataSetBuilder:

         return res

-
     def import_dataset_doccano(self, path):
         data = []
         with open(path) as infile:
@@ -266,20 +257,20 @@ class DataSetBuilder:

         for line in content:
             json_data = json.loads(line)
-            ent = {'entities':[]}
+            ent = {'entities': []}
             ent['entities'] = json_data['labels']
             data.append((json_data['text'], ent))

         self.TRAIN_DATA = data
         return data

-if __name__ == '__main__':
-    data = DataSetBuilder(df_commentary, df_ner_unique)
-
-    ner_lookup, ner_clean_lookup = data.get_NER_lookup()
-    data.get_commentaries()
-    #data.build_train_data()
-    #data.train_model()
-    #data.export_dataset_doccano()
-    #data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1')
-    #data.train_model()
+# if __name__ == '__main__':
+# data = DataSetBuilder(df_commentary, df_ner_unique)
+#
+# ner_lookup, ner_clean_lookup = data.get_NER_lookup()
+# data.get_commentaries()
+# data.build_train_data()
+# data.train_model()
+# data.export_dataset_doccano()
+# data.import_dataset_doccano('./commentaries/data_parsed/doccano_data/file.json1')
+# data.train_model()