eventExtractionHDN/entity_linker/KB_builder.py

import pandas as pd
import sparql
import warnings
from wikidata.client import Client # https://github.com/dahlia/wikidata
import tqdm
import json
import time


def testing_dbpedia(author):
    """Query the public DBpedia endpoint for an author URI.

    Returns a two-element list: the owl:sameAs rows (cross-KB links,
    including the Wikidata URI) and the rows of works whose dbo:author
    is the given URI.
    """
    endpoint = 'http://dbpedia.org/sparql'
    s = sparql.Service(endpoint, "utf-8", "GET")  # (endpoint, encoding, HTTP method)
    query_author = """SELECT ?names WHERE {{
        <{}> owl:sameAs ?names .
    }}""".format(author)
    query_works = """SELECT ?works WHERE {{
        ?works dbo:author <{}>
    }}""".format(author)
    result = s.query(query_author)
    results_works = s.query(query_works)
    return [result, results_works]
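
# A hedged usage sketch (not part of the original pipeline): the URI is an
# illustrative dbpedia.org/resource URI, and print_results() is defined below.
#
#   res = testing_dbpedia('http://dbpedia.org/resource/Dante_Alighieri')
#   print_results(res)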


def testing_wikidata(entity_q):
    """Fetch an entity from Wikidata and collect its works.

    Wikidata properties used:
        Notable work    = P800
        Present in work = P1441
        Date of birth   = P569 (currently unused, see TODO below)
    """
    # DEBUGGING_ENTITY = 'Q1398'
    # entity_q = DEBUGGING_ENTITY
    dict_works = {}
    dict_present_in_works = {}
    client = Client()
    entity = client.get(entity_q, load=True)
    notable_work = client.get('P800')
    present_in_work = client.get('P1441')
    # date_of_birth = client.get('P569')
    # birth = entity.get(date_of_birth)  # TODO: debug this
    aut_names = entity.label.texts
    # getlist() returns every claim for the property ([] when absent),
    # whereas get() would return only a single value
    for work in entity.getlist(notable_work):
        dict_works[work.id] = work.label.texts
    for p_work in entity.getlist(present_in_work):
        dict_present_in_works[p_work.id] = p_work.label.texts
    return entity, aut_names, dict_works, dict_present_in_works
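
# Shape of the return value, assuming the QID resolves (labels here are
# illustrative; the label dicts map language codes to strings):
#
#   entity, names, works, present = testing_wikidata('Q1067')
#   # names   -> {'en': 'Dante Alighieri', 'it': 'Dante Alighieri', ...}
#   # works   -> {'Q40185': {'en': 'Divine Comedy', ...}, ...}
#   # present -> {<QID>: <label dict>, ...}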


def print_results(results):
    result, results_works = results[0], results[1]
    print('# NAMES:')
    for row in result:
        values = sparql.unpack_row(row)
        print(values[0])
    print('# AUTHOR OF:')
    for row in results_works:
        values = sparql.unpack_row(row)
        print(values[0])


def extract_wikidata_endpoint(author_names, show_warnings=True):
    """Pick the Wikidata QID out of a list of owl:sameAs result rows."""
    uris = [sparql.unpack_row(name)[0] for name in author_names]
    r = [uri for uri in uris if 'wikidata' in uri]
    try:
        # e.g. 'http://www.wikidata.org/entity/Q1067' -> 'Q1067'
        endpoint = r[0].split('/')[-1]
        return endpoint
    except IndexError:
        if show_warnings:
            warnings.warn('Entity has no Wikidata endpoint')
        return None
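
# End-to-end sketch for one author URI (illustrative, mirrors the loop below):
#
#   rows = testing_dbpedia('http://dbpedia.org/resource/Dante_Alighieri')[0]
#   qid = extract_wikidata_endpoint(rows)  # -> a QID string such as 'Q1067', or None
#   if qid is not None:
#       _, names, works, present = testing_wikidata(qid)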


# PSEUDO-MAIN ------------------------------------------------------
stime = time.time()
convivio_df = pd.read_csv('../commentaries/data_parsed/convivio_DF.csv')
monarchia_df = pd.read_csv('../commentaries/data_parsed/monarchia_DF.csv')
rime_df = pd.read_csv('../commentaries/data_parsed/rime_DF.csv')
# drop NaN author URIs before deduplicating, otherwise the SPARQL
# queries above would be built around a literal <nan>
author_uri_list_convivio = list(set(convivio_df['author_uri'].dropna()))
author_uri_list_monarchia = list(set(monarchia_df['author_uri'].dropna()))
author_uri_list_rime = list(set(rime_df['author_uri'].dropna()))
# deduplicate authors shared across the three works
full_auth_list = list(set(author_uri_list_convivio + author_uri_list_monarchia + author_uri_list_rime))
dict_res = {}
print(f'# Number of authors: {len(full_auth_list)}')
for auth in tqdm.tqdm(full_auth_list):
    entity_q = testing_dbpedia(auth)[0]
    wikidata_endp = extract_wikidata_endpoint(entity_q, show_warnings=False)
    # authors with no Wikidata link all collapse onto a single null key
    dict_res[wikidata_endp] = None
    if wikidata_endp is not None:
        _, names, works, other_works = testing_wikidata(wikidata_endp)
        dict_res[wikidata_endp] = {'aut_name': names,
                                   'aut_works': works,
                                   'aut_present_work': other_works}
with open('knowledge_base/KB_wikimedia.json', 'w') as f:
    json.dump(dict_res, f)
print(f'# Process finished in: {round(time.time() - stime, 5)} s')