# TODO: work on fuzzy matching. See https://github.com/gandersen101/spaczz

import json
from difflib import SequenceMatcher

import pandas as pd


class KnowledgeBase:
    """Author knowledge base: maps QIDs to author names and works, and links
    NER predictions (authors and works of art) to its entries."""

    def __init__(self, kb_path, extension=None):
        with open(kb_path, 'r', encoding='utf-8') as infile:
            data = json.load(infile)

        self.id2aut = data
        self.aut2id = {}
        self.works2aut = {}
        self._populate_aut2id()
        if extension is not None:
            self._extend_kb(extension)
            # Re-index so the extension's new names and works are searchable too.
            self._populate_aut2id()

    def link_entities(self, preds, deepfuzz=False):
        COMMEDIA_DATE = 1321  # date of the Commedia; currently unused here
        PER_preds = [pred[0] for pred in preds if pred[1] == 'PER' and pred[0] != 'Dante']
        WORK_preds = [pred[0] for pred in preds if pred[1] == 'WORK_OF_ART']
        print('\nMODEL RAW PREDICTIONS:')
        print(f'Candidate authors (i.e., entities matched): {PER_preds}')
        print(f'Candidate works: {WORK_preds}')
        print('-' * 50 + '\n\nFINAL OUTPUT:\n### Author matches:')
"""
|
|
Sorting PER_preds (i.e., entities of type PERSON matched by the NER model in the given commentary) according
|
|
to the edit distance computed across all of the entities' name in the KB. At first, the similarity measure
|
|
is computed between the match and the exact name stored in the KB. Eventually, if deepscore == True, such a
|
|
metric is computed beteen the original NER match and every author name present in the KB but also split on
|
|
the ' ' (i.e., space) in order to deal with name abbreviations (i.e., 'Tommaso d'Aquino' is often referred to as
|
|
simply 'Tommaso' in the commentaries). Once sorted, the first element satisfying the given threshold is returned.
|
|
"""
|
|
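        # For instance, SequenceMatcher(None, 'Tommaso', "Tommaso d'Aquino").ratio()
        # is only about 0.61 (below the 0.8 threshold), whereas the split pass
        # compares 'Tommaso' with the token 'Tommaso' and scores 1.0.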
        aut_res = []
        for target in set(PER_preds):
            # First pass: score the NER match against every full author name.
            scores = [(auth, self._similar(target, auth)) for auth in self.aut2id]
            scores.sort(key=lambda tup: tup[1], reverse=True)
            success = False
            # Inspect the top three candidates (the slice is safe even when the
            # KB holds fewer than three names).
            for auth, score in scores[:3]:
                if score > .8:
                    print(f'Prediction (F): {target} - {self.id2aut[self.aut2id[auth]]["aut_name"]["it"]}')
                    success = True
                    aut_res.append(target)
                    break
            if deepfuzz and not success:
                # Second pass: keep each author's best score over the space-
                # separated tokens of their name, so abbreviations still match.
                deepscore = []
                for aut in self.aut2id:
                    sim = max(self._similar(target, split) for split in aut.split(' '))
                    deepscore.append((aut, sim))
                deepscore.sort(key=lambda tup: tup[1], reverse=True)
                for aut, score in deepscore[:3]:
                    if score > .8:
                        print(f'Prediction (S): {target} - {self.id2aut[self.aut2id[aut]]["aut_name"]["it"]}')
                        aut_res.append(target)
                        break
"""
|
|
Sorting WORK_preds (i.e., entities of type WORK_OF_ART matched by the NER model in the given commentary)
|
|
according to the edit distance computed across all of the titles in the KB. The similarity measure
|
|
is computed between the match and the exact name stored in the KB.
|
|
Once sorted, the first element satisfying the given threshold is returned.
|
|
"""
|
|
        work_res = {}
        if len(WORK_preds) != 0:
            print('-' * 50 + '\n### Works matches:')
            for target in set(WORK_preds):
                scores_work = [(work, self._similar(target, work)) for work in self.works2aut]
                scores_work.sort(key=lambda tup: tup[1], reverse=True)
                for work, score in scores_work[:3]:
                    if score > .7:
                        print(f'Prediction (F): {target} by: {self.works2aut[work]}')
                        work_res[target] = self.works2aut[work]
                        break

        return aut_res, work_res

    def _similar(self, a, b):
        """Return the SequenceMatcher similarity ratio (0.0-1.0) of two strings."""
        return SequenceMatcher(None, a, b).ratio()
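
    # A minimal sketch toward the TODO at the top of this file, assuming the
    # third-party rapidfuzz package (spaczz offers spaCy-integrated matchers in
    # the same spirit). `_similar_fuzzy` is a hypothetical helper added here for
    # illustration only; link_entities does not call it.
    def _similar_fuzzy(self, a, b):
        from rapidfuzz import fuzz  # local import keeps the dependency optional
        # token_sort_ratio ignores token order, so "Aquino, Tommaso d'" and
        # "Tommaso d'Aquino" score highly; rescale from 0-100 to 0.0-1.0.
        return fuzz.token_sort_ratio(a, b) / 100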

    def _populate_aut2id(self):
        """Build the name -> QID and work-title -> author-name lookup tables."""
        for qid, values in self.id2aut.items():
            if qid == 'null' or values is None:
                continue
            for name in set(values['aut_name'].values()):
                self.aut2id[name] = qid
            for wid, wvalues in values['aut_works'].items():
                try:
                    self.works2aut[wvalues['it']] = values['aut_name']['it']
                except KeyError:  # no Italian label for this work or author
                    continue
        return self

    def _extend_kb(self, df):
        """Merge authors/works from a DataFrame with quot_author and quot_title columns."""
        _qid = 0
        prev_work = ''
        for i in range(len(df)):
            row = df.iloc[i]
            auth = row.quot_author
            work = row.quot_title
            # Skip rows with a missing author or title; pd.notna is robust to any
            # NaN object, not just the np.nan singleton an identity test catches.
            if pd.notna(auth) and pd.notna(work) and work != prev_work:
                try:
                    # Known author: attach the work under a fresh synthetic id.
                    qid = self.aut2id[auth]
                    new_wid = f'W{_qid}'
                    _qid += 1
                    self.id2aut[qid]['aut_works'][new_wid] = {'it': work}
                except KeyError:
                    # Unknown author: create a synthetic entry for author + work.
                    new_qid = f'Q{_qid}'
                    new_wid = f'W{_qid}'
                    _qid += 1
                    self.id2aut[new_qid] = {'aut_name': {'it': auth},
                                            'aut_works': {new_wid: {'it': work}},
                                            'aut_present_work': {},
                                            'birth': 0}
                prev_work = work
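

# A minimal usage sketch, assuming an illustrative 'kb.json' path and hand-made
# (text, label) predictions like those an NER pass would emit; neither is an
# artifact of this repo.
if __name__ == '__main__':
    kb = KnowledgeBase('kb.json')
    preds = [('Tommaso', 'PER'), ('Etica', 'WORK_OF_ART'), ('Dante', 'PER')]
    authors, works = kb.link_entities(preds, deepfuzz=True)
    print(authors, works)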