# Build a small knowledge base of authors: resolve DBpedia author URIs to
# Wikidata entities and dump their names/works to a JSON file.
import pandas as pd
|
|
import sparql
|
|
import warnings
|
|
from wikidata.client import Client # https://github.com/dahlia/wikidata
|
|
import tqdm
|
|
import json
|
|
import time
|
|
|
|
|
|
def testing_dbpedia(author):
    """Fetch DBpedia data for *author*.

    Runs two SPARQL queries against the public DBpedia endpoint:
    one for the author's ``owl:sameAs`` links (used later to locate the
    Wikidata id) and one for the works whose ``dbo:author`` is *author*.

    Parameters
    ----------
    author : str
        Full DBpedia resource URI of the author.

    Returns
    -------
    list
        ``[sameas_rows, works_rows]`` -- the raw ``sparql`` result objects.
    """
    service = sparql.Service('http://dbpedia.org/sparql', "utf-8", "SELECT")

    sameas_query = """SELECT ?names WHERE {{
<{}> owl:sameAs ?names .
}}""".format(author)

    works_query = """SELECT ?works WHERE {{
?works dbo:author <{}>
}}""".format(author)

    return [service.query(sameas_query), service.query(works_query)]
|
|
|
|
|
|
def testing_wikidata(entity_q):
    """Load a Wikidata entity and collect its associated works.

    Properties used:
      * P800  -- notable work
      * P1441 -- present in work
      * P569  -- date of birth (noted in the original, not queried here)

    Parameters
    ----------
    entity_q : str
        Wikidata Q-id, e.g. ``'Q1398'``.

    Returns
    -------
    tuple
        ``(entity, label_texts, notable_works, present_in_works)`` where the
        last two map work Q-id -> dict of labels per language.
    """
    # DEBUGGING_ENTITY = 'Q1398'
    # entity_q = DEBUGGING_ENTITY
    client = Client()
    entity = client.get(entity_q, load=True)

    # Resolve the property objects, then read them off the entity; either
    # lookup may come back None when the entity lacks the property.
    notable = entity.get(client.get('P800'))
    present_in = entity.get(client.get('P1441'))
    # date_of_birth = client.get('P569')
    # birth = entity.get(date_of_birth) # TODO: debug this

    works_by_id = (
        {w.id: w.label.texts for w in notable} if notable is not None else {}
    )
    present_by_id = (
        {w.id: w.label.texts for w in present_in} if present_in is not None else {}
    )

    return entity, entity.label.texts, works_by_id, present_by_id
|
|
|
|
|
|
def print_results(results):
    """Pretty-print the ``[names, works]`` pair returned by testing_dbpedia.

    Each row is unpacked with ``sparql.unpack_row`` and only its first
    column is printed.
    """
    names_rows, works_rows = results

    print('# NAMES:')
    for row in names_rows:
        print(sparql.unpack_row(row)[0])

    print('# AUTHOR OF:')
    for row in works_rows:
        print(sparql.unpack_row(row)[0])
|
|
|
|
|
|
def extract_wikidata_endpoint(author_names, show_warnings=True):
    """Return the Wikidata Q-id found among an author's owl:sameAs links.

    Parameters
    ----------
    author_names : iterable
        SPARQL result rows, as returned by ``testing_dbpedia(author)[0]``.
    show_warnings : bool
        If True, emit a warning when no Wikidata link is present.

    Returns
    -------
    str or None
        Last path segment of the first URI containing 'wikidata'
        (e.g. ``'Q1398'``), or None when there is no such link.
    """
    # Unpack each row exactly once (the original called unpack_row twice
    # per row inside the comprehension).
    urls = (sparql.unpack_row(name)[0] for name in author_names)
    matches = [url for url in urls if 'wikidata' in url]

    if not matches:
        # Explicit emptiness check instead of catching IndexError;
        # also fixes the 'wikimdata' typo in the warning text.
        if show_warnings:
            warnings.warn('Entity does not have a Wikidata endpoint')
        return None

    return matches[0].split('/')[-1]
|
|
|
|
|
|
# PSEUDO-MAIN ------------------------------------------------------
# For every author URI found in the three commentary DataFrames, resolve
# its Wikidata entity via DBpedia and dump names/works to a JSON KB.
stime = time.time()

convivio_df = pd.read_csv('../commentaries/data_parsed/convivio_DF.csv')
monarchia_df = pd.read_csv('../commentaries/data_parsed/monarchia_DF.csv')
rime_df = pd.read_csv('../commentaries/data_parsed/rime_DF.csv')

# dropna() on all three sources: the original only dropped NaN for the
# convivio frame, so a missing author_uri in monarchia/rime would have
# been formatted into a bogus '<nan>' SPARQL query.
author_uri_sets = [
    set(df['author_uri'].dropna())
    for df in (convivio_df, monarchia_df, rime_df)
]
# Dedupe across the three works (the original concatenated the lists and
# queried repeated authors more than once); sorted for a stable run order.
full_auth_list = sorted(set().union(*author_uri_sets))

dict_res = {}
print(f'# Number of authors: {len(full_auth_list)}')
for auth in tqdm.tqdm(full_auth_list):
    sameas_rows = testing_dbpedia(auth)[0]
    wikidata_endp = extract_wikidata_endpoint(sameas_rows, show_warnings=False)
    if wikidata_endp is None:
        # Skip authors without a Wikidata link: the original stored them
        # under a None key, which json.dump serializes as a useless "null".
        continue
    _, names, works, other_works = testing_wikidata(wikidata_endp)
    dict_res[wikidata_endp] = {'aut_name': names,
                               'aut_works': works,
                               'aut_present_work': other_works}

with open('knowledge_base/KB_wikimedia.json', 'w+') as f:
    json.dump(dict_res, f)

print(f'# Process finished in: {round((time.time()-stime), 5)}')
|