import pandas as pd
import sparql
import warnings
from wikidata.client import Client  # https://github.com/dahlia/wikidata
import tqdm
import json
import time


def testing_dbpedia(author):
    """Query DBpedia for an author's sameAs links and authored works.

    Parameters
    ----------
    author : str
        Full DBpedia resource URI of the author.

    Returns
    -------
    list
        ``[result, results_works]`` — two sparql result sets: the
        author's ``owl:sameAs`` URIs (used downstream to locate the
        Wikidata endpoint) and the works having this author as
        ``dbo:author``.
    """
    endpoint = 'http://dbpedia.org/sparql'
    s = sparql.Service(endpoint, "utf-8", "SELECT")
    # Doubled braces collapse to single braces after .format().
    query_author = """SELECT ?names WHERE {{ <{}> owl:sameAs ?names . }}""".format(author)
    query_works = """SELECT ?works WHERE {{ ?works dbo:author <{}> }}""".format(author)
    result = s.query(query_author)
    results_works = s.query(query_works)
    return [result, results_works]


def testing_wikidata(entity_q):
    """Load a Wikidata entity and collect the works it is linked to.

    Wikidata properties used:
        P800  = notable work
        P1441 = present in work
        P569  = date of birth (lookup currently disabled, see note below)

    Parameters
    ----------
    entity_q : str
        Wikidata entity identifier (e.g. ``'Q1398'``).

    Returns
    -------
    tuple
        ``(entity, aut_names, dict_works, dict_present_in_works)`` where
        ``aut_names`` is the entity label's language→text mapping and the
        two dicts map work ids to their label texts.
    """
    dict_works = {}
    dict_present_in_works = {}
    client = Client()
    entity = client.get(entity_q, load=True)
    notable_work = client.get('P800')
    present_in_work = client.get('P1441')
    # NOTE: date-of-birth lookup (P569) is disabled pending debugging:
    # date_of_birth = client.get('P569')
    # birth = entity.get(date_of_birth)
    aut_names = entity.label.texts
    _works = entity.get(notable_work)
    _present_in_work = entity.get(present_in_work)
    # entity.get returns None when the property is absent on the entity.
    if _works is not None:
        for work in _works:
            dict_works[work.id] = work.label.texts
    if _present_in_work is not None:
        for p_work in _present_in_work:
            dict_present_in_works[p_work.id] = p_work.label.texts
    return entity, aut_names, dict_works, dict_present_in_works


def print_results(results):
    """Pretty-print the two result sets returned by :func:`testing_dbpedia`."""
    result, results_works = results[0], results[1]
    print('# NAMES:')
    for row in result:
        values = sparql.unpack_row(row)
        print(values[0])
    print('# AUTHOR OF:')
    for row in results_works:
        values = sparql.unpack_row(row)
        print(values[0])


def extract_wikidata_endpoint(author_names, show_warnings=True):
    """Extract the Wikidata Q-id from a set of ``owl:sameAs`` sparql rows.

    Parameters
    ----------
    author_names : iterable
        Sparql result rows whose first unpacked value is a URI.
    show_warnings : bool
        When True, emit a warning if no Wikidata URI is found.

    Returns
    -------
    str or None
        The last path segment of the first URI containing ``'wikidata'``
        (i.e. the Q-id), or None when no such URI exists.
    """
    # Unpack each row exactly once (the original unpacked twice per row).
    uris = [sparql.unpack_row(name)[0] for name in author_names]
    r = [uri for uri in uris if 'wikidata' in uri]
    try:
        return r[0].split('/')[-1]
    except IndexError:
        if show_warnings:
            # Fixed typo/grammar in the original message
            # ('Entity has not a wikimdata endpoint ').
            warnings.warn('Entity has no wikidata endpoint')
        return None


# PSEUDO-MAIN ------------------------------------------------------
stime = time.time()

convivio_df = pd.read_csv('../commentaries/data_parsed/convivio_DF.csv')
monarchia_df = pd.read_csv('../commentaries/data_parsed/monarchia_DF.csv')
rime_df = pd.read_csv('../commentaries/data_parsed/rime_DF.csv')

# Drop NaN author URIs for ALL three works. The original applied dropna()
# only to the Convivio and used raw .values for the other two, letting
# NaN slip into the SPARQL queries as '<nan>'.
author_uri_list_convivio = list(set(convivio_df['author_uri'].dropna()))
author_uri_list_monarchia = list(set(monarchia_df['author_uri'].dropna()))
author_uri_list_rime = list(set(rime_df['author_uri'].dropna()))
full_auth_list = author_uri_list_convivio + author_uri_list_monarchia + author_uri_list_rime

dict_res = {}
print(f'# Number of authors: {len(full_auth_list)}')
for auth in tqdm.tqdm(full_auth_list):
    # Only the sameAs rows (index 0) are needed to locate the Wikidata Q-id.
    entity_q = testing_dbpedia(auth)[0]
    wikidata_endp = extract_wikidata_endpoint(entity_q, show_warnings=False)
    # Authors without a Wikidata endpoint are recorded under the key None.
    dict_res[wikidata_endp] = None
    if wikidata_endp is not None:
        _, names, works, other_works = testing_wikidata(wikidata_endp)
        dict_res[wikidata_endp] = {'aut_name': names,
                                   'aut_works': works,
                                   'aut_present_work': other_works}

# 'w' truncates and creates like 'w+' but without the unneeded read mode.
with open('knowledge_base/KB_wikimedia.json', 'w') as f:
    json.dump(dict_res, f)

print(f'# Process finished in: {round((time.time()-stime), 5)}')