Uso di YARRRML per definire regole di mapping

This commit is contained in:
cesare 2023-03-21 09:29:59 +01:00
parent 1054e4f877
commit 10675a2d56
2 changed files with 294 additions and 0 deletions

40
rules.yaml Normal file
View File

@ -0,0 +1,40 @@
# YARRRML mapping rules: SSHOC Multilingual Metadata (JSON) -> SKOS resources.
# Namespace IRIs must end with '/' or '#': local names are appended verbatim.
prefixes:
  dc: http://purl.org/dc/elements/1.1/
  dct: http://purl.org/dc/terms/
  iso639-3: http://id.loc.gov/vocabulary/iso639-3/
  owl: http://www.w3.org/2002/07/owl#
  skos: http://www.w3.org/2004/02/skos/core#
  # NOTE(review): 'XXX' looks like a placeholder - confirm the final base IRI.
  sshoccmd: http://sshoc.eu/XXX/
  xsd: http://www.w3.org/2001/XMLSchema#
mappings:
  # One concept scheme, built from the top-level fields of the JSON document.
  conceptscheme:
    sources:
      - ['data.json~jsonpath', '$']
    # NOTE(review): the subject is a fixed IRI inside the SKOS namespace - confirm the intended scheme IRI.
    s: skos:_
    po:
      - [a, skos:ConceptScheme]
      - [dc:title, $(title), en~lang]
      - [dc:description, $(description), en~lang]
      # NOTE(review): 'version' is not a Dublin Core element - confirm the intended property.
      - [dc:version, $(version), xsd:integer]
  # One SKOS concept per entry of the 'concepts' array.
  concept:
    sources:
      - ['data.json~jsonpath', '$.concepts[*]']
    s: sshoccmd:$(ConceptId)
    po:
      - [a, skos:Concept]
      - [skos:exactMatch, $(URI)]
      - [dct:source, $(source)]
      - [skos:prefLabel, $(Englishterm), en~lang]
      - [skos:definition, $(Englishdefinition), en~lang]
      - [skos:prefLabel, $(Dutchterm), nl~lang]
      - [skos:definition, $(Dutchdefinition), nl~lang]
      # Greek uses the ISO 639-1 code 'el' ('gr' is not a language code).
      - [skos:prefLabel, $(Greekterm), el~lang]
      - [skos:definition, $(Greekdefinition), el~lang]
      - [skos:prefLabel, $(Italianterm), it~lang]
      - [skos:definition, $(Italiandefinition), it~lang]
      - [skos:prefLabel, $(Frenchterm), fr~lang]
      - [skos:definition, $(Frenchdefinition), fr~lang]

254
sshoc_31_skos-yarrrml.ipynb Normal file
View File

@ -0,0 +1,254 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "mechanical-johns",
"metadata": {},
"source": [
"## Mapping *SSHOC Multilingual Metadata* to SKOS resources \n",
"\n",
"This Notebook implements a simple parser used to transform the SSHOC Multilingual Metadata, created in Task 3.1 of the SSHOC project and published as spreadsheets, into SKOS resources. The parser reads the spreadsheet and transforms its content into SKOS data following a set of mapping rules defined using [YARRRML](https://rml.io/yarrrml/); the result is stored in Turtle files and uploaded to a Fuseki server.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "aware-comparison",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import rdflib\n",
"import itertools\n",
"import yaml\n",
"import datetime\n",
"import json\n",
"from jsonpath_ng import jsonpath, parse\n",
"from rdflib.namespace import DC, DCAT, DCTERMS, OWL, \\\n",
" RDF, RDFS, SKOS, \\\n",
" XMLNS, XSD, XMLNS\n",
"from rdflib import Namespace\n",
"from rdflib import URIRef, BNode, Literal"
]
},
{
"cell_type": "markdown",
"id": "failing-shift",
"metadata": {},
"source": [
"The file *config.yaml* contains the external information used in the parsing, including the path of the spreadsheets. Set the correct values before running the Notebook.\n",
"The file *rules.yaml* contains the YARRRML mapping rules."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cutting-triangle",
"metadata": {},
"outputs": [],
"source": [
"def load_yaml(path):\n",
"    \"\"\"Parse the YAML file at path and return its content.\n",
"\n",
"    A readable message is printed and the error is re-raised when the file\n",
"    is missing or is not valid YAML, so the notebook stops here instead of\n",
"    failing later on an undefined variable.\n",
"    \"\"\"\n",
"    try:\n",
"        with open(path, 'r', encoding='utf-8') as stream:\n",
"            return yaml.safe_load(stream)\n",
"    except FileNotFoundError:\n",
"        print(f'Warning {path} file not present! Please store it in the same directory as the notebook')\n",
"        raise\n",
"    except yaml.YAMLError as exc:\n",
"        print(f'Error while parsing {path}: {exc}')\n",
"        raise\n",
"\n",
"\n",
"# conf: external settings (source URL, texts); rules: YARRRML mapping rules\n",
"conf = load_yaml('config.yaml')\n",
"rules = load_yaml('rules.yaml')"
]
},
{
"cell_type": "markdown",
"id": "dd570749",
"metadata": {},
"source": [
"The following functions implement a basic parser that processes the YARRRML rules and creates an RDF graph"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "403b935b",
"metadata": {},
"outputs": [],
"source": [
"def jsonmapper(json_source, rules):\n",
"    \"\"\"Build an RDF graph from JSON data using a subset of YARRRML mapping rules.\n",
"\n",
"    Args:\n",
"        json_source: parsed JSON document (dicts/lists) on which the JSONPath\n",
"            expressions of the rules are evaluated.\n",
"        rules: parsed YARRRML document with a 'prefixes' dict and a 'mappings'\n",
"            dict; every mapping provides 'sources', 's' and 'po'.\n",
"\n",
"    Returns:\n",
"        rdflib.Graph with all prefixes bound and the generated triples.\n",
"\n",
"    Supported subset: only the JSONPath of the first source is used (the\n",
"    source name is not read); the subject is 'prefix:name' or\n",
"    'prefix:$(field)'; 'a' entries add rdf:type; every other object is a\n",
"    '$(field)' reference emitted as a literal, language-tagged when the third\n",
"    element is '<code>~lang'. Other annotations (e.g. datatypes) are ignored.\n",
"\n",
"    JSON null values never produce a triple, and records without a subject\n",
"    identifier are skipped.\n",
"    \"\"\"\n",
"    def field_path(template):\n",
"        # '$(field)' -> 'field', the JSONPath evaluated on each record\n",
"        return template.replace('$(', '').replace(')', '')\n",
"\n",
"    def expand(curie):\n",
"        # 'prefix:local' -> URIRef in the namespace declared for 'prefix'\n",
"        prefix, local = curie.split(':', 1)\n",
"        return nss[prefix][local]\n",
"\n",
"    ccr_y = rdflib.Graph()\n",
"    nss = {}\n",
"    for key, uri in rules['prefixes'].items():\n",
"        nss[key] = Namespace(URIRef(uri))\n",
"        ccr_y.bind(key, nss[key])\n",
"\n",
"    for mapping in rules['mappings'].values():\n",
"        records = [match.value for match in parse(mapping['sources'][0][1]).find(json_source)]\n",
"        subject_prefix, subject_local = mapping['s'].split(':', 1)\n",
"        subject_ns = nss[subject_prefix]\n",
"        subject_expr = parse(field_path(subject_local)) if '$' in subject_local else None\n",
"\n",
"        # Compile the predicate/object rules once per mapping, not once per record.\n",
"        type_uris = []\n",
"        literal_rules = []\n",
"        for popob in mapping['po']:\n",
"            if popob[0] == 'a':\n",
"                type_uris.append(expand(popob[1]))\n",
"                continue\n",
"            lang = ''\n",
"            if len(popob) > 2 and 'lang' in popob[2]:\n",
"                lang = popob[2].replace('~lang', '')\n",
"            literal_rules.append((expand(popob[0]), parse(field_path(popob[1])), lang))\n",
"\n",
"        for record in records:\n",
"            if subject_expr is None:\n",
"                subject_id = subject_local\n",
"            else:\n",
"                ids = [match.value for match in subject_expr.find(record) if match.value is not None]\n",
"                if not ids:\n",
"                    # No identifier: a subject IRI cannot be minted for this record.\n",
"                    continue\n",
"                subject_id = ids[0]\n",
"            # Cast to str so that non-string identifiers (e.g. numbers) still yield a distinct IRI.\n",
"            subject = subject_ns[str(subject_id)]\n",
"\n",
"            for type_uri in type_uris:\n",
"                ccr_y.add((subject, RDF.type, type_uri))\n",
"            for predicate, value_expr, lang in literal_rules:\n",
"                for match in value_expr.find(record):\n",
"                    if match.value is None:\n",
"                        # JSON null (empty spreadsheet cell) would become the literal 'None'.\n",
"                        continue\n",
"                    ob = Literal(match.value)\n",
"                    if lang != '':\n",
"                        ob = Literal(ob, lang=lang)\n",
"                    ccr_y.add((subject, predicate, ob))\n",
"    return ccr_y"
]
},
{
"cell_type": "markdown",
"id": "ruled-america",
"metadata": {},
"source": [
"Download **SSHOC Multilingual Metadata** spreadsheet"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "olive-archive",
"metadata": {},
"outputs": [],
"source": [
"# Location of the published spreadsheet, taken from config.yaml\n",
"mdurl = conf['Source']['METADATASOURCE']\n",
"df_metadata = pd.read_csv(filepath_or_buffer=mdurl)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "square-michael",
"metadata": {},
"outputs": [],
"source": [
"# Give every column the field name referenced by the mapping rules.\n",
"df_metadata = df_metadata.rename(columns={\n",
"    'English': 'Englishterm', 'Unnamed: 1': 'Englishdefinition',\n",
"    'Unnamed: 2': 'source', 'Unnamed: 3': 'URI',\n",
"    'Dutch': 'Dutchterm', 'Unnamed: 5': 'Dutchdefinition',\n",
"    'French': 'Frenchterm', 'Unnamed: 7': 'Frenchdefinition',\n",
"    'Greek': 'Greekterm', 'Unnamed: 9': 'Greekdefinition',\n",
"    'Italian': 'Italianterm', 'Unnamed: 11': 'Italiandefinition',\n",
"})\n",
"# Drop the row labelled 0 (presumably the spreadsheet sub-header - TODO confirm);\n",
"# errors='ignore' keeps the cell safe to re-run once the row is gone.\n",
"df_metadata = df_metadata.drop(0, errors='ignore')\n",
"# .str.replace leaves empty (NaN) cells untouched instead of raising on them.\n",
"df_metadata['ConceptId'] = df_metadata['URI'].str.replace('http://hdl.handle.net/11459/', '', regex=False)\n",
"df_metadata['source'] = (\n",
"    df_metadata['source']\n",
"    .str.replace('(source: ', '', regex=False)\n",
"    .str.replace(')', '', regex=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ff9828c8",
"metadata": {},
"outputs": [],
"source": [
"# Keep a copy of the cleaned records on disk, one JSON object per row\n",
"df_metadata.to_json(path_or_buf='data/file.json', orient='records')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ebdd5eb0",
"metadata": {},
"outputs": [],
"source": [
"# Wrap the concept records with the scheme-level texts from config.yaml\n",
"myjson = df_metadata.to_json(orient='records')\n",
"concepts_metadata = json.loads(myjson)\n",
"json_metadata = {\n",
"    'concepts': concepts_metadata,\n",
"    'title': conf['Texts']['METADATATITLE'],\n",
"    'description': conf['Texts']['METADATADESCRIPTION'],\n",
"    'id': conf['Texts']['METADATAID'],\n",
"    'createdate': conf['Texts']['METADATACREATEDATE'],\n",
"    'version': conf['Texts']['METADATAVERSION'],\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "abd396d6",
"metadata": {},
"outputs": [],
"source": [
"# Apply the YARRRML rules to the assembled JSON document\n",
"skosgraph = jsonmapper(json_source=json_metadata, rules=rules)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a0a6b04a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Graph identifier=Nd89e673445d645f69476105bb18cad6a (<class 'rdflib.graph.Graph'>)>"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Write the graph as Turtle (the .ttl file is advertised as Turtle, not N3) and as RDF/XML\n",
"skosgraph.serialize(destination='data/skosccr_y.ttl', format='turtle')\n",
"skosgraph.serialize(destination='data/skosccr_y.rdf', format='pretty-xml')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "elementary-graphics",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}