sshoc-skosmapping/sshoc_31_skos-yarrrml.ipynb

254 lines
8.2 KiB
Plaintext
Raw Permalink Normal View History

{
"cells": [
{
"cell_type": "markdown",
"id": "mechanical-johns",
"metadata": {},
"source": [
"## Mapping *SSHOC Multilingual Metadata* to SKOS resources \n",
"\n",
"This Notebook implements a simple parser used to transform the SSHOC Multilingual Metadata, created in Task 3.1 of the SSHOC project and published as a spreadsheet, into a SKOS resource. The parser reads the spreadsheet and transforms its content following a set of mapping rules defined using [YARRRML](https://rml.io/yarrrml/); the result is stored in Turtle files and loaded into a Fuseki server."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "aware-comparison",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import rdflib\n",
"import itertools\n",
"import yaml\n",
"import datetime\n",
"import json\n",
"from jsonpath_ng import jsonpath, parse\n",
"from rdflib.namespace import DC, DCAT, DCTERMS, OWL, \\\n",
" RDF, RDFS, SKOS, \\\n",
" XMLNS, XSD, XMLNS\n",
"from rdflib import Namespace\n",
"from rdflib import URIRef, BNode, Literal"
]
},
{
"cell_type": "markdown",
"id": "failing-shift",
"metadata": {},
"source": [
"The file *config.yaml* contains the external information used in the parsing, including the path of the spreadsheets. Set the correct values before running the Notebook.\n",
"The file *rules.yaml* contains the YARRRML mapping rules."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cutting-triangle",
"metadata": {},
"outputs": [],
"source": [
"def load_yaml(path):\n",
"    \"\"\"Parse the YAML file at ``path`` and return its content.\n",
"\n",
"    A message naming the offending file is printed before the error is\n",
"    re-raised: the later cells read both ``conf`` and ``rules``, so carrying\n",
"    on without them would only postpone the failure to an unrelated NameError.\n",
"\n",
"    Raises\n",
"    ------\n",
"    FileNotFoundError\n",
"        If ``path`` does not exist.\n",
"    yaml.YAMLError\n",
"        If the file is not valid YAML.\n",
"    \"\"\"\n",
"    try:\n",
"        with open(path, 'r', encoding='utf-8') as stream:\n",
"            return yaml.safe_load(stream)\n",
"    except FileNotFoundError:\n",
"        print(f'Warning {path} file not present! Please store it in the same directory as the notebook')\n",
"        raise\n",
"    except yaml.YAMLError as exc:\n",
"        print(f'Error while parsing {path}: {exc}')\n",
"        raise\n",
"\n",
"\n",
"# External settings read by the later cells, and the YARRRML mapping rules.\n",
"conf = load_yaml(\"config.yaml\")\n",
"rules = load_yaml(\"rules.yaml\")"
]
},
{
"cell_type": "markdown",
"id": "dd570749",
"metadata": {},
"source": [
"The following functions implement a basic parser that processes the YARRRML rules and creates an RDF graph."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "403b935b",
"metadata": {},
"outputs": [],
"source": [
"def _template_path(template):\n",
"    \"\"\"Return ``template`` with the YARRRML ``$(`` and ``)`` reference markers removed.\"\"\"\n",
"    return template.replace('$(', '').replace(')', '')\n",
"\n",
"\n",
"def _expand_curie(curie, namespaces):\n",
"    \"\"\"Turn a ``prefix:local`` name into a URI using the declared namespaces.\"\"\"\n",
"    # Split on the first colon only, so a local name may itself contain ':'.\n",
"    prefix, local_name = curie.split(':', 1)\n",
"    return namespaces[prefix][local_name]\n",
"\n",
"\n",
"def jsonmapper(json_source, rules):\n",
"    \"\"\"Apply YARRRML-style mapping rules to a JSON document and return an RDF graph.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    json_source : dict\n",
"        The JSON document the JSONPath expressions are evaluated against.\n",
"    rules : dict\n",
"        Parsed rules with the keys ``prefixes`` (prefix -> namespace URI) and\n",
"        ``mappings``. For every mapping this function reads ``sources[0][1]``\n",
"        (JSONPath selecting the records), ``s`` (subject written as\n",
"        ``prefix:local``, where ``local`` may be a ``$(path)`` reference) and\n",
"        ``po`` (entries of the form ``[predicate, object]`` or\n",
"        ``[predicate, object, 'xx~lang']``; the predicate ``a`` declares an\n",
"        ``rdf:type``).\n",
"\n",
"    Returns\n",
"    -------\n",
"    rdflib.Graph\n",
"        Graph holding the generated triples, with every prefix bound.\n",
"\n",
"    Raises\n",
"    ------\n",
"    KeyError\n",
"        If a rule uses a prefix that is missing from ``prefixes``.\n",
"\n",
"    Notes\n",
"    -----\n",
"    Records for which the subject reference yields no value are skipped, and\n",
"    JSON ``null`` values are never turned into literals.\n",
"    \"\"\"\n",
"    graph = rdflib.Graph()\n",
"    namespaces = {}\n",
"    for prefix, base_uri in rules['prefixes'].items():\n",
"        namespaces[prefix] = Namespace(URIRef(base_uri))\n",
"        graph.bind(prefix, namespaces[prefix])\n",
"\n",
"    for mapping in rules['mappings'].values():\n",
"        record_expr = parse(mapping['sources'][0][1])\n",
"\n",
"        subject_prefix, subject_local = mapping['s'].split(':', 1)\n",
"        subject_ns = namespaces[subject_prefix]\n",
"        subject_expr = parse(_template_path(subject_local)) if '$' in subject_local else None\n",
"\n",
"        # Compile the predicate/object rules once per mapping instead of once per record.\n",
"        type_uris = []\n",
"        value_rules = []\n",
"        for po in mapping['po']:\n",
"            if po[0] == 'a':\n",
"                type_uris.append(_expand_curie(po[1], namespaces))\n",
"                continue\n",
"            lang = ''\n",
"            if len(po) > 2 and 'lang' in po[2]:\n",
"                lang = po[2].replace('~lang', '')\n",
"            value_rules.append((_expand_curie(po[0], namespaces), parse(_template_path(po[1])), lang))\n",
"\n",
"        for record in (match.value for match in record_expr.find(json_source)):\n",
"            if subject_expr is None:\n",
"                subject_id = subject_local\n",
"            else:\n",
"                found_ids = [m.value for m in subject_expr.find(record) if m.value is not None]\n",
"                if not found_ids:\n",
"                    # Without an identifier no subject URI can be minted for this record.\n",
"                    continue\n",
"                subject_id = str(found_ids[0])\n",
"            subject = subject_ns[subject_id]\n",
"\n",
"            for type_uri in type_uris:\n",
"                graph.add((subject, RDF.type, type_uri))\n",
"\n",
"            for predicate, value_expr, lang in value_rules:\n",
"                for match in value_expr.find(record):\n",
"                    if match.value is None:\n",
"                        continue\n",
"                    if lang:\n",
"                        # A language-tagged literal cannot also carry a datatype, so use the plain text form.\n",
"                        obj = Literal(str(match.value), lang=lang)\n",
"                    else:\n",
"                        obj = Literal(match.value)\n",
"                    graph.add((subject, predicate, obj))\n",
"    return graph"
]
},
{
"cell_type": "markdown",
"id": "ruled-america",
"metadata": {},
"source": [
"Download **SSHOC Multilingual Metadata** spreadsheet"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "olive-archive",
"metadata": {},
"outputs": [],
"source": [
"# The location of the metadata table is taken from the configuration; it is read as CSV.\n",
"mdurl = conf['Source']['METADATASOURCE']\n",
"df_metadata = pd.read_csv(filepath_or_buffer=mdurl)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "square-michael",
"metadata": {},
"outputs": [],
"source": [
"# Descriptive names for the language columns and the unnamed columns that follow them.\n",
"column_names = {\n",
"    'English': 'Englishterm', 'Unnamed: 1': 'Englishdefinition', 'Unnamed: 2': 'source',\n",
"    'Unnamed: 3': 'URI', 'Dutch': 'Dutchterm', 'Unnamed: 5': 'Dutchdefinition',\n",
"    'French': 'Frenchterm', 'Unnamed: 7': 'Frenchdefinition',\n",
"    'Greek': 'Greekterm', 'Unnamed: 9': 'Greekdefinition',\n",
"    'Italian': 'Italianterm', 'Unnamed: 11': 'Italiandefinition',\n",
"}\n",
"\n",
"# Row 0 is dropped (presumably a second header row of the sheet - TODO confirm).\n",
"# errors='ignore' keeps the cell re-runnable once that row is already gone.\n",
"df_metadata = (\n",
"    df_metadata\n",
"    .rename(columns=column_names)\n",
"    .drop(index=0, errors='ignore')\n",
")\n",
"\n",
"# The .str accessor leaves empty (NaN) cells untouched instead of raising on them;\n",
"# regex=False because the patterns contain parentheses that must match literally.\n",
"df_metadata['ConceptId'] = df_metadata['URI'].str.replace('http://hdl.handle.net/11459/', '', regex=False)\n",
"df_metadata['source'] = (\n",
"    df_metadata['source']\n",
"    .str.replace('(source: ', '', regex=False)\n",
"    .str.replace(')', '', regex=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ff9828c8",
"metadata": {},
"outputs": [],
"source": [
"# Keep a copy of the cleaned table on disk, one JSON object per row.\n",
"df_metadata.to_json(path_or_buf='data/file.json', orient='records')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ebdd5eb0",
"metadata": {},
"outputs": [],
"source": [
"# Serialise the table to JSON text and parse it back into a list of plain dicts.\n",
"myjson = df_metadata.to_json(orient='records')\n",
"concepts_metadata = json.loads(myjson)\n",
"\n",
"# Document handed to the mapper: the concept records plus the descriptive texts from the configuration.\n",
"json_metadata = {\n",
"    'concepts': concepts_metadata,\n",
"    'title': conf['Texts']['METADATATITLE'],\n",
"    'description': conf['Texts']['METADATADESCRIPTION'],\n",
"    'id': conf['Texts']['METADATAID'],\n",
"    'createdate': conf['Texts']['METADATACREATEDATE'],\n",
"    'version': conf['Texts']['METADATAVERSION'],\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "abd396d6",
"metadata": {},
"outputs": [],
"source": [
"# Run the mapping rules over the assembled JSON document.\n",
"skosgraph = jsonmapper(json_source=json_metadata, rules=rules)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a0a6b04a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Graph identifier=Nd89e673445d645f69476105bb18cad6a (<class 'rdflib.graph.Graph'>)>"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Write the graph twice: as Turtle and as RDF/XML.\n",
"# The .ttl file is produced with the Turtle serializer so its content matches its extension.\n",
"skosgraph.serialize(destination='data/skosccr_y.ttl', format=\"turtle\")\n",
"skosgraph.serialize(destination='data/skosccr_y.rdf', format=\"pretty-xml\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "elementary-graphics",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}